From 1ba5bf993c6a3142e18e68ea6452b347f9cb5635 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 5 Jul 2016 10:18:08 +0200 Subject: xfrm: fix crash in XFRM_MSG_GETSA netlink handler If we hit any of the error conditions inside xfrm_dump_sa(), then xfrm_state_walk_init() never gets called. However, we still call xfrm_state_walk_done() from xfrm_dump_sa_done(), which will crash because the state walk was never initialized properly. We can fix this by setting cb->args[0] only after we've processed the first element and checking this before calling xfrm_state_walk_done(). Fixes: d3623099d3 ("ipsec: add support of limited SA dump") Cc: Nicolas Dichtel Cc: Steffen Klassert Signed-off-by: Vegard Nossum Acked-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d516845e16e3..4fb04ced5867 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -896,7 +896,8 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb) struct sock *sk = cb->skb->sk; struct net *net = sock_net(sk); - xfrm_state_walk_done(walk, net); + if (cb->args[0]) + xfrm_state_walk_done(walk, net); return 0; } @@ -921,8 +922,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) u8 proto = 0; int err; - cb->args[0] = 1; - err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy); if (err < 0) @@ -939,6 +938,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) proto = nla_get_u8(attrs[XFRMA_PROTO]); xfrm_state_walk_init(walk, proto, filter); + cb->args[0] = 1; } (void) xfrm_state_walk(net, walk, dump_one_state, &info); -- cgit v1.2.3 From 73efc3245fd3edb3632d82a3a9c5d5d975a02efc Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 27 Jul 2016 08:03:18 +0200 Subject: xfrm: get rid of incorrect WARN AFAICT this message is just printed whenever input validation fails. This is a normal failure and we shouldn't be dumping the stack over it. Looks like it was originally a printk that was maybe incorrectly upgraded to a WARN: commit 62db5cfd70b1ef53aa21f144a806fe3b78c84fab Author: stephen hemminger Date: Wed May 12 06:37:06 2010 +0000 xfrm: add severity to printk Cc: Stephen Hemminger Cc: Steffen Klassert Signed-off-by: Vegard Nossum Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4fb04ced5867..1a4f142dd50a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2117,7 +2117,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, err = verify_newpolicy_info(&ua->policy); if (err) - goto bad_policy; + goto free_state; /* build an XP */ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err); @@ -2149,8 +2149,6 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; -bad_policy: - WARN(1, "BAD policy passed\n"); free_state: kfree(x); nomem: -- cgit v1.2.3 From 7677c7560c3e80fde08a7e710d378dedabf950c3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 27 Jul 2016 08:44:15 +0200 Subject: xfrm: get rid of another incorrect WARN During fuzzing I regularly run into this WARN(). According to Herbert Xu, this "certainly shouldn't be a WARN, it probably shouldn't print anything either". 
Cc: Stephen Hemminger Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Vegard Nossum Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1a4f142dd50a..cb65d916a345 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2051,9 +2051,6 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (up->hard) { xfrm_policy_delete(xp, p->dir); xfrm_audit_policy_delete(xp, 1, true); - } else { - // reset the timers here? - WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); -- cgit v1.2.3 From 6916fb3b10b3cbe3b1f9f5b680675f53e4e299eb Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Fri, 29 Jul 2016 09:57:32 +0200 Subject: xfrm: Ignore socket policies when rebuilding hash tables Whenever thresholds are changed the hash tables are rebuilt. This is done by enumerating all policies and hashing and inserting them into the right table according to the thresholds and direction. Because socket policies are also contained in net->xfrm.policy_all but no hash tables are defined for their direction (dir + XFRM_POLICY_MAX) this causes a NULL or invalid pointer dereference after returning from policy_hash_bysel() if the rebuild is done while any socket policies are installed. Since the rebuild after changing thresholds is scheduled this crash could even occur if the userland sets thresholds seemingly before installing any socket policies. Fixes: 53c2e285f970 ("xfrm: Do not hash socket policies") Signed-off-by: Tobias Brunner Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..45f9cf97ea25 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -626,6 +626,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + /* skip socket policies */ + continue; + } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, -- cgit v1.2.3 From 4d0bd46a4d55383f7b925e6cf7865a77e0f0e020 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 8 Aug 2016 08:45:33 +0200 Subject: Revert "wext: Fix 32 bit iwpriv compatibility issue with 64 bit Kernel" This reverts commit 3d5fdff46c4b2b9534fa2f9fc78e90a48e0ff724. Ben Hutchings pointed out that the commit isn't safe since it assumes that the structure used by the driver is iw_point, when in fact there's no way to know about that. Fortunately, the only driver in the tree that ever runs this code path is the wilc1000 staging driver, so it doesn't really matter. Clearly I should have investigated this better before applying, sorry. 
Reported-by: Ben Hutchings Cc: stable@vger.kernel.org [though I guess it doesn't matter much] Fixes: 3d5fdff46c4b ("wext: Fix 32 bit iwpriv compatibility issue with 64 bit Kernel") Signed-off-by: Johannes Berg --- net/wireless/wext-core.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index dbb2738e356a..6250b1cfcde5 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -958,29 +958,8 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr, return private(dev, iwr, cmd, info, handler); } /* Old driver API : call driver ioctl handler */ - if (dev->netdev_ops->ndo_do_ioctl) { -#ifdef CONFIG_COMPAT - if (info->flags & IW_REQUEST_FLAG_COMPAT) { - int ret = 0; - struct iwreq iwr_lcl; - struct compat_iw_point *iwp_compat = (void *) &iwr->u.data; - - memcpy(&iwr_lcl, iwr, sizeof(struct iwreq)); - iwr_lcl.u.data.pointer = compat_ptr(iwp_compat->pointer); - iwr_lcl.u.data.length = iwp_compat->length; - iwr_lcl.u.data.flags = iwp_compat->flags; - - ret = dev->netdev_ops->ndo_do_ioctl(dev, (void *) &iwr_lcl, cmd); - - iwp_compat->pointer = ptr_to_compat(iwr_lcl.u.data.pointer); - iwp_compat->length = iwr_lcl.u.data.length; - iwp_compat->flags = iwr_lcl.u.data.flags; - - return ret; - } else -#endif - return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); - } + if (dev->netdev_ops->ndo_do_ioctl) + return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); return -EOPNOTSUPP; } -- cgit v1.2.3 From e45a8a9e60ff1dd5ad118c794337a1101b46ab0d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 9 Aug 2016 18:27:08 +0200 Subject: xfrm: constify xfrm_replay structures The xfrm_replay structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_replay.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..d2fdd6d70959 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -187,7 +187,7 @@ struct xfrm_state { struct xfrm_replay_state_esn *preplay_esn; /* The functions for replay detection. 
*/ - struct xfrm_replay *repl; + const struct xfrm_replay *repl; /* internal flag that only holds state for delayed aevent at the * moment diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 4fd725a0c500..cdc2e2e71bff 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) x->repl->notify(x, XFRM_REPLAY_UPDATE); } -static struct xfrm_replay xfrm_replay_legacy = { +static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, @@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = { .overflow = xfrm_replay_overflow, }; -static struct xfrm_replay xfrm_replay_bmp = { +static const struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, @@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = { .overflow = xfrm_replay_overflow_bmp, }; -static struct xfrm_replay xfrm_replay_esn = { +static const struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, -- cgit v1.2.3 From ae3fb6d32140e5c5b491892105ca89066171d217 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:04 +0200 Subject: xfrm: state: use hlist_for_each_entry_rcu helper This is required once we allow lockless access of bydst/bysrc hash tables. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..904ab4d4ac05 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -76,18 +76,18 @@ static void xfrm_hash_transfer(struct hlist_head *list, h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); - hlist_add_head(&x->bydst, ndsttable+h); + hlist_add_head_rcu(&x->bydst, ndsttable + h); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); - hlist_add_head(&x->bysrc, nsrctable+h); + hlist_add_head_rcu(&x->bysrc, nsrctable + h); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); - hlist_add_head(&x->byspi, nspitable+h); + hlist_add_head_rcu(&x->byspi, nspitable + h); } } } @@ -520,10 +520,10 @@ int __xfrm_state_delete(struct xfrm_state *x) x->km.state = XFRM_STATE_DEAD; spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); - hlist_del(&x->bydst); - hlist_del(&x->bysrc); + hlist_del_rcu(&x->bydst); + hlist_del_rcu(&x->bysrc); if (x->id.spi) - hlist_del(&x->byspi); + hlist_del_rcu(&x->byspi); net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); @@ -659,7 +659,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) { + hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -683,7 +683,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, unsigned int h = xfrm_src_hash(net, daddr, saddr, family); struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { + 
hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -781,7 +781,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -797,7 +797,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, goto found; h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) { + hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -852,12 +852,12 @@ found: if (km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, daddr, saddr, encap_family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); @@ -945,16 +945,16 @@ static void __xfrm_state_insert(struct xfrm_state *x) h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); @@ -1063,9 +1063,9 @@ static struct xfrm_state *__find_acq_core(struct net *net, xfrm_state_hold(x); tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, daddr, saddr, family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); net->xfrm.state_num++; @@ -1581,7 +1581,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) if (x->id.spi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; -- cgit v1.2.3 From 
02efdff7e209859c2755ebe93b3bd0e3d40123ab Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:05 +0200 Subject: xfrm: state: use atomic_inc_not_zero to increment refcount Once xfrm_state_lookup_byaddr no longer acquires the state lock another cpu might be freeing the state entry at the same time. To detect this we use atomic_inc_not_zero, we then signal -EAGAIN to caller in case our result was stale. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 904ab4d4ac05..84c1db6254d5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -37,6 +37,11 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; +static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) +{ + return atomic_inc_not_zero(&x->refcnt); +} + static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -668,7 +673,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, if ((mark & x->mark.m) != x->mark.v) continue; - xfrm_state_hold(x); + if (!xfrm_state_hold_rcu(x)) + continue; return x; } @@ -692,7 +698,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, if ((mark & x->mark.m) != x->mark.v) continue; - xfrm_state_hold(x); + if (!xfrm_state_hold_rcu(x)) + continue; return x; } @@ -871,10 +878,14 @@ found: } } out: - if (x) - xfrm_state_hold(x); - else + if (x) { + if (!xfrm_state_hold_rcu(x)) { + *err = -EAGAIN; + x = NULL; + } + } else { *err = acquire_in_progress ? -EAGAIN : error; + } spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From df7274eb70b7c8488170ebe8757dd94647a8e1e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:06 +0200 Subject: xfrm: state: delay freeing until rcu grace period has elapsed The hash table backend memory and the state structs are free'd via kfree/vfree. Once we only rely on rcu during lookups we have to make sure no other cpu is currently accessing this before doing the free. Free operations already happen from worker so we can use synchronize_rcu to wait until concurrent readers are done. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 84c1db6254d5..8e373876924f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -146,6 +146,9 @@ static void xfrm_hash_resize(struct work_struct *work) spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); + + synchronize_rcu(); + xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); @@ -369,6 +372,8 @@ static void xfrm_state_gc_task(struct work_struct *work) hlist_move_list(&net->xfrm.state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); + synchronize_rcu(); + hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) xfrm_state_gc_destroy(x); } -- cgit v1.2.3 From b65e3d7be06fd8ff5236439254f338fe1a8d4bbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:07 +0200 Subject: xfrm: state: add sequence count to detect hash resizes Once xfrm_state_find is lockless we have to cope with a concurrent resize opertion. 
We use a sequence counter to block in case a resize is in progress and to detect if we might have missed a state that got moved to a new hash table. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8e373876924f..ac4037cf6a29 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -36,6 +36,7 @@ */ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; +static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { @@ -127,6 +128,7 @@ static void xfrm_hash_resize(struct work_struct *work) } spin_lock_bh(&net->xfrm.xfrm_state_lock); + write_seqcount_begin(&xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; for (i = net->xfrm.state_hmask; i >= 0; i--) @@ -143,6 +145,7 @@ static void xfrm_hash_resize(struct work_struct *work) net->xfrm.state_byspi = nspi; net->xfrm.state_hmask = nhashmask; + write_seqcount_end(&xfrm_state_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); @@ -787,10 +790,13 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; + unsigned int sequence; struct km_event c; to_put = NULL; + sequence = read_seqcount_begin(&xfrm_state_hash_generation); + spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { @@ -894,6 +900,15 @@ out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); + + if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) { + *err = -EAGAIN; + if (x) { + xfrm_state_put(x); + x = NULL; + } + } + return x; } -- cgit v1.2.3 From c8406998b80183ef87895ab1de4dbed8bb2d53a0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:08 +0200 Subject: xfrm: state: use rcu_deref and assign_pointer helpers Before xfrm_state_find() can use rcu_read_lock instead of xfrm_state_lock we need to switch users of the hash table to assign/obtain the pointers with the appropriate rcu helpers. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ac4037cf6a29..53e7867f9254 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -28,6 +28,9 @@ #include "xfrm_hash.h" +#define xfrm_state_deref_prot(table, net) \ + rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) + /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. 
(input,ctl) @@ -131,18 +134,17 @@ static void xfrm_hash_resize(struct work_struct *work) write_seqcount_begin(&xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; + odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) - xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi, - nhashmask); + xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask); - odst = net->xfrm.state_bydst; - osrc = net->xfrm.state_bysrc; - ospi = net->xfrm.state_byspi; + osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); + ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); ohashmask = net->xfrm.state_hmask; - net->xfrm.state_bydst = ndst; - net->xfrm.state_bysrc = nsrc; - net->xfrm.state_byspi = nspi; + rcu_assign_pointer(net->xfrm.state_bydst, ndst); + rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); + rcu_assign_pointer(net->xfrm.state_byspi, nspi); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&xfrm_state_hash_generation); -- cgit v1.2.3 From d737a5805581c6f99dad4caa9fdf80965d617d1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Aug 2016 12:16:09 +0200 Subject: xfrm: state: don't use lock anymore unless acquire operation is needed push the lock down, after earlier patches we can rely on rcu to make sure state struct won't go away. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 6 +++--- net/xfrm/xfrm_state.c | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 24cd3949a9a4..1ab51d188408 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -38,9 +38,9 @@ struct netns_xfrm { * mode. Also, it can be used by ah/esp icmp error handler to find * offending SA. */ - struct hlist_head *state_bydst; - struct hlist_head *state_bysrc; - struct hlist_head *state_byspi; + struct hlist_head __rcu *state_bydst; + struct hlist_head __rcu *state_bysrc; + struct hlist_head __rcu *state_byspi; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 53e7867f9254..1a15b658a79e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -799,7 +799,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&xfrm_state_hash_generation); - spin_lock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { if (x->props.family == encap_family && @@ -870,6 +870,7 @@ found: } if (km_query(x, tmpl, pol) == 0) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); @@ -883,6 +884,7 @@ found: tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -899,7 +901,7 @@ out: } else { *err = acquire_in_progress ? 
-EAGAIN : error; } - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From 1625f4529957738be7d87cf157e107b8fb9d23b9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 10 Aug 2016 13:54:57 +0300 Subject: net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key Running LTP 'icmp-uni-basic.sh -6 -p ipcomp -m tunnel' test over openvswitch + veth can trigger kernel panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0 IP: [] xfrm_input+0x82/0x750 ... [] xfrm6_rcv_spi+0x1e/0x20 [] xfrm6_tunnel_rcv+0x42/0x50 [xfrm6_tunnel] [] tunnel6_rcv+0x3e/0x8c [tunnel6] [] ip6_input_finish+0xd5/0x430 [] ip6_input+0x33/0x90 [] ip6_rcv_finish+0xa5/0xb0 ... It seems that tunnel.ip6 can have garbage values and also dereferenced without a proper check, only tunnel.ip4 is being verified. Fix it by adding one more if block for AF_INET6 and initialize tunnel.ip6 with NULL inside xfrm6_rcv_spi() (which is similar to xfrm4_rcv_spi()). Fixes: 049f8e2 ("xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input") Signed-off-by: Alexey Kodanev Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_input.c | 1 + net/xfrm/xfrm_input.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 0eaab1fa6be5..00a2d40677d6 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -23,6 +23,7 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1c4ad477ce93..6e3f0254d8a1 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ - if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { - switch (family) { - case AF_INET: + switch (family) { + case AF_INET: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); - break; - case AF_INET6: + break; + case AF_INET6: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); - break; - } + break; } /* Allocate new secpath or COW existing one. */ -- cgit v1.2.3 From a5eefc1df641f3c99fe54b309e7b79c18cec4a1e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:52 +0200 Subject: xfrm: policy: use rcu versions for iteration and list add/del This is required once we allow lockless readers. 
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..93b8ff74001f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -426,14 +426,14 @@ redo: h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0) { - hlist_del(&pol->bydst); - hlist_add_head(&pol->bydst, ndsttable+h); + hlist_del_rcu(&pol->bydst); + hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; - hlist_del(&pol->bydst); - hlist_add_behind(&pol->bydst, entry0); + hlist_del_rcu(&pol->bydst); + hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -1106,7 +1106,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, read_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, daddr, saddr, family, dir); ret = NULL; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -1122,7 +1122,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry_rcu(pol, chain, bydst) { if ((pol->priority >= priority) && ret) break; @@ -1271,7 +1271,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { - hlist_del(&pol->bydst); + hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } -- cgit v1.2.3 From e1e551bc56302b80ff930c966f9985095fb1b70d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:53 +0200 Subject: xfrm: policy: prepare policy_bydst hash for rcu lookups Since commit 56f047305dd4b6b617 ("xfrm: add rcu grace period in xfrm_policy_destroy()") xfrm policy objects are already free'd via rcu. In order to make more places lockless (i.e. use rcu_read_lock instead of grabbing read-side of policy rwlock) we only need to: - use rcu_assign_pointer to store address of new hash table backend memory - add rcu barrier so that freeing of old memory is delayed (expansion and free happens from system workqueue, so synchronize_rcu is fine) - use rcu_dereference to fetch current address of the hash table. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 93b8ff74001f..4a8d90a88c83 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -385,9 +385,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net, __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); - return (hash == hmask + 1 ? 
- &net->xfrm.policy_inexact[dir] : - net->xfrm.policy_bydst[dir].table + hash); + if (hash == hmask + 1) + return &net->xfrm.policy_inexact[dir]; + + return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, @@ -403,7 +405,8 @@ static struct hlist_head *policy_hash_direct(struct net *net, __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); - return net->xfrm.policy_bydst[dir].table + hash; + return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, @@ -468,8 +471,8 @@ static void xfrm_bydst_resize(struct net *net, int dir) unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); - struct hlist_head *odst = net->xfrm.policy_bydst[dir].table; struct hlist_head *ndst = xfrm_hash_alloc(nsize); + struct hlist_head *odst; int i; if (!ndst) @@ -477,14 +480,19 @@ static void xfrm_bydst_resize(struct net *net, int dir) write_lock_bh(&net->xfrm.xfrm_policy_lock); + odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); + for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); - net->xfrm.policy_bydst[dir].table = ndst; + rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_unlock_bh(&net->xfrm.xfrm_policy_lock); + synchronize_rcu(); + xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } -- cgit v1.2.3 From 30846090a746edfdb230deadd638cfa96f7b8c91 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:54 +0200 Subject: xfrm: policy: add sequence count to sync with hash resize Once xfrm_policy_lookup_bytype doesn't grab xfrm_policy_lock anymore its possible for a hash resize to occur in parallel. Use sequence counter to block lookup in case a resize is in progress and to also re-lookup in case hash table was altered in the mean time (might cause use to not find the best-match). 
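[Editor's note: a minimal, illustrative sketch of the seqcount retry pattern described above, with invented names (the actual patch splits the retry across chain selection and a final check before taking a reference). The resize path bumps the generation while still holding the policy lock; lockless readers repeat the lookup if the generation changed underneath them.

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static seqcount_t hash_generation = SEQCNT_ZERO(hash_generation);
static DEFINE_SPINLOCK(hash_lock);

/* Writer: the resize runs under the lock and bumps the generation count. */
static void resize_hash_table(void)
{
        spin_lock_bh(&hash_lock);
        write_seqcount_begin(&hash_generation);
        /* ... allocate new table, transfer entries, rcu_assign_pointer() it ... */
        write_seqcount_end(&hash_generation);
        spin_unlock_bh(&hash_lock);
}

/* Reader: redo the whole lookup if a resize ran while we walked the chain. */
static bool key_present(u32 key)
{
        unsigned int seq;
        bool found;

        do {
                seq = read_seqcount_begin(&hash_generation);
                found = false;
                /* ... rcu_read_lock(); walk the chain selected for @key,
                 *     setting @found on a match; rcu_read_unlock(); ...
                 */
        } while (read_seqcount_retry(&hash_generation, seq));

        return found;
}
End of editor's note.]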
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4a8d90a88c83..576d90321068 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] __read_mostly; static struct kmem_cache *xfrm_dst_cache __read_mostly; +static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); @@ -479,6 +480,10 @@ static void xfrm_bydst_resize(struct net *net, int dir) return; write_lock_bh(&net->xfrm.xfrm_policy_lock); + write_seqcount_begin(&xfrm_policy_hash_generation); + + odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); @@ -489,6 +494,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; + write_seqcount_end(&xfrm_policy_hash_generation); write_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -1104,7 +1110,8 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, struct xfrm_policy *pol, *ret; const xfrm_address_t *daddr, *saddr; struct hlist_head *chain; - u32 priority = ~0U; + unsigned int sequence; + u32 priority; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); @@ -1112,7 +1119,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, return NULL; read_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_direct(net, daddr, saddr, family, dir); + retry: + do { + sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + chain = policy_hash_direct(net, daddr, saddr, family, dir); + } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + + priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); @@ -1148,6 +1161,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + goto retry; + xfrm_pol_hold(ret); fail: read_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -3090,6 +3106,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); + seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); } -- cgit v1.2.3 From e37cc8ade5afaf082f804c6d18eb23377146bec4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:55 +0200 Subject: xfrm: policy: use atomic_inc_not_zero in rcu section If we don't hold the policy lock anymore the refcnt might already be 0, i.e. policy struct is about to be free'd. Switch to atomic_inc_not_zero to avoid this. On removal policies are already unlinked from the tables (lists) before the last _put occurs so we are not supposed to find the same 'dead' entry on the next loop, so its safe to just repeat the lookup. 
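[Editor's note: this is the classic refcount-under-RCU idiom. A lockless reader can find an object whose last reference has already been dropped, so it must take its reference with atomic_inc_not_zero() and treat failure as "this entry is dead". A generic, illustrative sketch with invented types (not the actual xfrm_policy/xfrm_state code):

#include <linux/atomic.h>
#include <linux/rculist.h>
#include <linux/types.h>

struct obj {
        atomic_t refcnt;
        u32 key;
        struct hlist_node node;
};

/* Fails if the refcount already hit zero, i.e. a free is in flight. */
static bool obj_hold_rcu(struct obj *o)
{
        return atomic_inc_not_zero(&o->refcnt);
}

static struct obj *obj_lookup(struct hlist_head *chain, u32 key)
{
        struct obj *o;

        rcu_read_lock();
        hlist_for_each_entry_rcu(o, chain, node) {
                if (o->key != key)
                        continue;
                if (!obj_hold_rcu(o))
                        continue;       /* dying entry: skip it (or signal -EAGAIN) */
                rcu_read_unlock();
                return o;               /* caller now owns a reference */
        }
        rcu_read_unlock();
        return NULL;
}

The release side is what makes this safe: the final put must defer the actual free past an RCU grace period, as the earlier "delay freeing until rcu grace period has elapsed" patch arranges for xfrm states.
End of editor's note.]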
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 576d90321068..09f2e2b38246 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -60,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) +{ + return atomic_inc_not_zero(&policy->refcnt); +} + static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { @@ -1164,7 +1169,8 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) goto retry; - xfrm_pol_hold(ret); + if (ret && !xfrm_pol_hold_rcu(ret)) + goto retry; fail: read_unlock_bh(&net->xfrm.xfrm_policy_lock); -- cgit v1.2.3 From a7c44247f704e385c77579d65c6ee6d002832529 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:56 +0200 Subject: xfrm: policy: make xfrm_policy_lookup_bytype lockless side effect: no longer disables BH (should be fine). Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 1ab51d188408..3ab828a97e68 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -11,7 +11,7 @@ struct ctl_table_header; struct xfrm_policy_hash { - struct hlist_head *table; + struct hlist_head __rcu *table; unsigned int hmask; u8 dbits4; u8 sbits4; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 09f2e2b38246..9302647f20a0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1123,7 +1123,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - read_lock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&xfrm_policy_hash_generation); @@ -1172,7 +1172,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From ae33786f73a7ce5b15ce29e8f342e43606385cef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:57 +0200 Subject: xfrm: policy: only use rcu in xfrm_sk_policy_lookup Don't acquire the readlock anymore and rely on rcu alone. In case writer on other CPU changed policy at the wrong moment (after we obtained sk policy pointer but before we could obtain the reference) just repeat the lookup. 
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9302647f20a0..3d27b9a2fbac 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1249,10 +1249,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl) { struct xfrm_policy *pol; - struct net *net = sock_net(sk); rcu_read_lock(); - read_lock_bh(&net->xfrm.xfrm_policy_lock); + again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, @@ -1267,8 +1266,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, policy_to_flow_dir(dir)); - if (!err) - xfrm_pol_hold(pol); + if (!err && !xfrm_pol_hold_rcu(pol)) + goto again; else if (err == -ESRCH) pol = NULL; else @@ -1277,7 +1276,6 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, pol = NULL; } out: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); rcu_read_unlock(); return pol; } -- cgit v1.2.3 From d5b8f86dc7200d16e48bb3a6aaac29c0cdf621c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:58 +0200 Subject: xfrm: policy: don't acquire policy lock in xfrm_spd_getinfo It doesn't seem that important. We now get inconsistent view of the counters, but those are stale anyway right after we drop the lock. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3d27b9a2fbac..35b85a9a358c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -560,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total) void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { - read_lock_bh(&net->xfrm.xfrm_policy_lock); si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; @@ -569,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; - read_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_spd_getinfo); -- cgit v1.2.3 From 9d0380df6217e8dd014118fa1c99dda9974f3613 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Aug 2016 15:17:59 +0200 Subject: xfrm: policy: convert policy_lock to spinlock After earlier patches conversions all spots acquire the writer lock and we can now convert this to a normal spinlock. 
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_policy.c | 68 ++++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 3ab828a97e68..177ed444d7b2 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -73,7 +73,7 @@ struct netns_xfrm { struct dst_ops xfrm6_dst_ops; #endif spinlock_t xfrm_state_lock; - rwlock_t xfrm_policy_lock; + spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; /* flow cache part */ diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 35b85a9a358c..dd01fd2e55fa 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -484,7 +484,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) if (!ndst) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, @@ -500,7 +500,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&xfrm_policy_hash_generation); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -519,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total) if (!nidx) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); @@ -527,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total) net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } @@ -617,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { @@ -659,7 +659,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head(&policy->bydst, chain); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } @@ -770,7 +770,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct hlist_head *chain; struct hlist_node *newpos; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; @@ -781,7 +781,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; @@ -817,7 +817,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); @@ -837,7 
+837,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, struct hlist_head *chain; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -850,7 +850,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -859,7 +859,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -878,7 +878,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, return NULL; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { @@ -889,7 +889,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -898,7 +898,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -956,7 +956,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) @@ -972,14 +972,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } @@ -991,13 +991,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again2; } } @@ -1006,7 +1006,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (!cnt) err = -ESRCH; out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -1026,7 +1026,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else @@ -1054,7 +1054,7 @@ int xfrm_policy_walk(struct net *net, struct 
xfrm_policy_walk *walk, } list_del_init(&walk->walk.all); out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); @@ -1073,9 +1073,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) if (list_empty(&walk->walk.all)) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); @@ -1321,9 +1321,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; @@ -1342,7 +1342,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) return -EINVAL; #endif - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { @@ -1360,7 +1360,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) */ xfrm_sk_policy_unlink(old_pol, dir); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); @@ -1390,9 +1390,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->type = old->type; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; @@ -3074,7 +3074,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); + spin_lock_init(&net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); return 0; @@ -3206,7 +3206,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * struct hlist_head *chain; u32 priority = ~0U; - read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && @@ -3230,7 +3230,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * xfrm_pol_hold(ret); - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } -- cgit v1.2.3 From 11d7a0bb95eaaba1741bb24a7c3c169c82f09c7b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 14 Aug 2016 19:52:56 -0700 Subject: xfrm: Only add l3mdev oif to dst lookups Subash reported that commit 42a7b32b73d6 ("xfrm: Add oif to dst lookups") broke a wifi use case that uses fib rules and xfrms. The intent of 42a7b32b73d6 was driven by VRFs with IPsec. 
As a compromise relax the use of oif in xfrm lookups to L3 master devices only (ie., oif is either an L3 master device or is enslaved to a master device). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Reported-by: Subash Abhinov Kasiviswanathan Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a115..e07ed8b1deb3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; - fl4->flowi4_oif = oif; + fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); if (saddr) fl4->saddr = saddr->a4; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c074771a10f7..dd84ecd1221b 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; + fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) -- cgit v1.2.3 From 4141b36ab16d7a66b4cf712f2d21eba61c5927e5 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 24 Aug 2016 13:08:40 +0200 Subject: xfrm: Fix xfrm_policy_lock imbalance An earlier patch accidentally replaced a write_lock_bh with a spin_unlock_bh. Fix this by using spin_lock_bh instead. Fixes: 9d0380df6217 ("xfrm: policy: convert policy_lock to spinlock") Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dd01fd2e55fa..f7ce6265961a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -979,7 +979,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) xfrm_policy_kill(pol); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } -- cgit v1.2.3 From 35db57bbc4b7ab810bba6e6d6954a0faf5a842cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Aug 2016 16:00:12 +0200 Subject: xfrm: state: remove per-netns gc task After commit 5b8ef3415a21f173 ("xfrm: Remove ancient sleeping when the SA is in acquire state") gc does not need any per-netns data anymore. As far as gc is concerned all state structs are the same, so we can use a global work struct for it. 
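[Editor's note: a generic, illustrative sketch of the resulting arrangement, one global work item draining one global list under a spinlock, with invented names; the patch below does the equivalent with xfrm_state_gc_work, xfrm_state_gc_list and xfrm_state_gc_lock, and calls synchronize_rcu() before destroying the states.

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct entry {
        struct hlist_node gclist;
        /* ... payload ... */
};

static void gc_task(struct work_struct *work);

static DECLARE_WORK(gc_work, gc_task);
static HLIST_HEAD(gc_list);
static DEFINE_SPINLOCK(gc_lock);

/* Called once the last reference to @e is gone. */
static void entry_schedule_free(struct entry *e)
{
        spin_lock_bh(&gc_lock);
        hlist_add_head(&e->gclist, &gc_list);
        spin_unlock_bh(&gc_lock);
        schedule_work(&gc_work);
}

static void gc_task(struct work_struct *work)
{
        struct hlist_head batch = HLIST_HEAD_INIT;
        struct hlist_node *tmp;
        struct entry *e;

        spin_lock_bh(&gc_lock);
        hlist_move_list(&gc_list, &batch);
        spin_unlock_bh(&gc_lock);

        synchronize_rcu();      /* let lockless readers drain before freeing */

        hlist_for_each_entry_safe(e, tmp, &batch, gclist)
                kfree(e);
}
End of editor's note.]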
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 -- net/xfrm/xfrm_state.c | 18 +++++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 177ed444d7b2..27bb9633c69d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -44,8 +44,6 @@ struct netns_xfrm { unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; - struct hlist_head state_gc_list; - struct work_struct state_gc_work; struct list_head policy_all; struct hlist_head *policy_byidx; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1a15b658a79e..ba8bf518ba14 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -31,6 +31,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +static void xfrm_state_gc_task(struct work_struct *work); + /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) @@ -41,6 +43,9 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); +static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); +static HLIST_HEAD(xfrm_state_gc_list); + static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { return atomic_inc_not_zero(&x->refcnt); @@ -368,13 +373,12 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) static void xfrm_state_gc_task(struct work_struct *work) { - struct net *net = container_of(work, struct net, xfrm.state_gc_work); struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); - hlist_move_list(&net->xfrm.state_gc_list, &gc_list); + hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); @@ -515,14 +519,12 @@ EXPORT_SYMBOL(xfrm_state_alloc); void __xfrm_state_destroy(struct xfrm_state *x) { - struct net *net = xs_net(x); - WARN_ON(x->km.state != XFRM_STATE_DEAD); spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&net->xfrm.state_gc_work); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -2134,8 +2136,6 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); - INIT_HLIST_HEAD(&net->xfrm.state_gc_list); - INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); spin_lock_init(&net->xfrm.xfrm_state_lock); return 0; @@ -2153,7 +2153,7 @@ void xfrm_state_fini(struct net *net) flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, IPSEC_PROTO_ANY, false); - flush_work(&net->xfrm.state_gc_work); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- cgit v1.2.3 From 16590a228109e2f318d2cc6466221134cfab723a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 22 Aug 2016 14:57:42 -0400 Subject: SUNRPC: Silence WARN_ON when NFSv4.1 over RDMA is in use Using NFSv4.1 on RDMA should be safe, so broaden the new checks in rpc_create(). WARN_ON_ONCE is used, matching most other WARN call sites in clnt.c. 
Fixes: 39a9beab5acb ("rpc: share one xps between all backchannels") Fixes: d50039ea5ee6 ("nfsd4/rpc: move backchannel create logic...") Signed-off-by: Chuck Lever Reviewed-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7f79fb7dc6a0..66f23b376fa0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -453,7 +453,7 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt_switch *xps; if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) { - WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xps = args->bc_xprt->xpt_bc_xps; xprt_switch_get(xps); } else { @@ -520,7 +520,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) char servername[48]; if (args->bc_xprt) { - WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP); + WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xprt = args->bc_xprt->xpt_bc_xprt; if (xprt) { xprt_get(xprt); -- cgit v1.2.3 From 89e1f6d2b956649fbe0704d543a90b8e0cf872b0 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 01:02:18 +0800 Subject: netfilter: nft_reject: restrict to INPUT/FORWARD/OUTPUT After I add the nft rule "nft add rule filter prerouting reject with tcp reset", kernel panic happened on my system: NULL pointer dereference at ... IP: [] nf_send_reset+0xaf/0x400 Call Trace: [] ? nf_reject_ip_tcphdr_get+0x160/0x160 [] nft_reject_ipv4_eval+0x61/0xb0 [nft_reject_ipv4] [] nft_do_chain+0x1fa/0x890 [nf_tables] [] ? __nft_trace_packet+0x170/0x170 [nf_tables] [] ? nf_ct_invert_tuple+0xb0/0xc0 [nf_conntrack] [] ? nf_nat_setup_info+0x5d4/0x650 [nf_nat] [...] Because in the PREROUTING chain, routing information is not exist, then we will dereference the NULL pointer and oops happen. So we restrict reject expression to INPUT, FORWARD and OUTPUT chain. This is consistent with iptables REJECT target. 
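For illustration, the validation added below amounts to a hook whitelist checked at rule-load time; the helper name and variables in this sketch are placeholders rather than the kernel function:

    #include <linux/errno.h>
    #include <linux/netfilter.h>

    /* Reject expressions need routing context, which only exists on the
     * INPUT, FORWARD and OUTPUT hooks. */
    static int reject_hook_supported(unsigned int hook)
    {
            unsigned int allowed = (1 << NF_INET_LOCAL_IN) |
                                   (1 << NF_INET_FORWARD)  |
                                   (1 << NF_INET_LOCAL_OUT);

            return (allowed & (1 << hook)) ? 0 : -EOPNOTSUPP;
    }

With a check of this shape in place, the prerouting rule from the report is refused when it is added instead of dereferencing a missing route at packet time.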
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_reject.h | 4 ++++ net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/netfilter/nft_reject.c | 16 ++++++++++++++++ net/netfilter/nft_reject_inet.c | 7 ++++++- 5 files changed, 28 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 60fa1530006b..02e28c529b29 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -8,6 +8,10 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index c24f41c816b3..2c2553b9026c 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 533cd5719c59..92bda9908bb9 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .eval = nft_reject_ipv6_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 0522fc9bfb0a..c64de3f7379d 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_reject_policy); +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_reject_validate); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 759ca5248a3d..e79d9ca2ffee 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; + int icmp_code, err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .eval = nft_reject_inet_eval, .init = nft_reject_inet_init, .dump = nft_reject_inet_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From 
93fac10b99d78eb2c50a739cba2e590c7332d539 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 21:58:16 +0800 Subject: netfilter: nfnetlink: use list_for_each_entry_safe to delete all objects cttimeout and acct objects are deleted from the list while traversing it, so use list_for_each_entry is unsafe here. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_acct.c | 6 +++--- net/netfilter/nfnetlink_cttimeout.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 70eb2f6a3b01..d44d89b56127 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -343,12 +343,12 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { - char *acct_name; - struct nf_acct *cur; + struct nf_acct *cur, *tmp; int ret = -ENOENT; + char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 68216cdc7083..f74fee1e2d0a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -350,12 +350,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, const struct nlmsghdr *nlh, const struct nlattr * const cda[]) { - struct ctnl_timeout *cur; + struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry(cur, &net->nfct_timeout_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, + head) ctnl_timeout_try_del(net, cur); return 0; -- cgit v1.2.3 From 23aaba5ad55547db62bada5066c8fb6412d5b1c2 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 21:58:17 +0800 Subject: netfilter: cttimeout: put back l4proto when replacing timeout policy We forget to call nf_ct_l4proto_put when replacing the existing timeout policy. Acctually, there's no need to get ct l4proto before doing replace, so we can move it to a later position. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index f74fee1e2d0a..6844c7af0b8f 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -98,31 +98,28 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, break; } - l4proto = nf_ct_l4proto_find_get(l3num, l4num); - - /* This protocol is not supportted, skip. */ - if (l4proto->l4proto != l4num) { - ret = -EOPNOTSUPP; - goto err_proto_put; - } - if (matching) { if (nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. 
*/ if (matching->l3num != l3num || - matching->l4proto->l4proto != l4num) { - ret = -EINVAL; - goto err_proto_put; - } - - ret = ctnl_timeout_parse_policy(&matching->data, - l4proto, net, - cda[CTA_TIMEOUT_DATA]); - return ret; + matching->l4proto->l4proto != l4num) + return -EINVAL; + + return ctnl_timeout_parse_policy(&matching->data, + matching->l4proto, net, + cda[CTA_TIMEOUT_DATA]); } - ret = -EBUSY; + + return -EBUSY; + } + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supportted, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; goto err_proto_put; } -- cgit v1.2.3 From 533e33009897c7dd1b0424c0d4b3331b222d5681 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 21:58:18 +0800 Subject: netfilter: cttimeout: unlink timeout objs in the unconfirmed ct lists KASAN reported this bug: BUG: KASAN: use-after-free in icmp_packet+0x25/0x50 [nf_conntrack_ipv4] at addr ffff880002db08c8 Read of size 4 by task lt-nf-queue/19041 Call Trace: [] dump_stack+0x63/0x88 [] kasan_report_error+0x528/0x560 [] kasan_report+0x58/0x60 [] ? icmp_packet+0x25/0x50 [nf_conntrack_ipv4] [] __asan_load4+0x61/0x80 [] icmp_packet+0x25/0x50 [nf_conntrack_ipv4] [] nf_conntrack_in+0x550/0x980 [nf_conntrack] [] ? __nf_conntrack_confirm+0xb10/0xb10 [nf_conntrack] [ ... ] The main reason is that we missed to unlink the timeout objects in the unconfirmed ct lists, so we will access the timeout objects that have already been freed. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 6844c7af0b8f..139e0867e56e 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -302,7 +302,16 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) const struct hlist_nulls_node *nn; unsigned int last_hsize; spinlock_t *lock; - int i; + int i, cpu; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) + untimeout(h, timeout); + spin_unlock_bh(&pcpu->lock); + } local_bh_disable(); restart: -- cgit v1.2.3 From 960fa72f67f1be6891d63a5518860d1ae4e14b88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 22:57:56 +0800 Subject: netfilter: nft_meta: improve the validity check of pkttype set expr "meta pkttype set" is only supported on prerouting chain with bridge family and ingress chain with netdev family. But the validate check is incomplete, and the user can add the nft rules on input chain with bridge family, for example: # nft add table bridge filter # nft add chain bridge filter input {type filter hook input \ priority 0 \;} # nft add chain bridge filter test # nft add rule bridge filter test meta pkttype set unicast # nft add rule bridge filter input jump test This patch fixes the problem. 
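For illustration, the check being tightened is again a per-family hook whitelist, now evaluated both at init time and whenever the expression is bound to a chain; the helper below is a placeholder sketch, not the kernel function:

    #include <linux/errno.h>
    #include <linux/netfilter.h>
    #include <linux/netfilter_bridge.h>

    /* "meta pkttype set" only makes sense where the link-layer packet
     * type can still usefully be rewritten. */
    static int pkttype_set_hooks(int family, unsigned int *hooks)
    {
            switch (family) {
            case NFPROTO_BRIDGE:
                    *hooks = 1 << NF_BR_PRE_ROUTING;
                    return 0;
            case NFPROTO_NETDEV:
                    *hooks = 1 << NF_NETDEV_INGRESS;
                    return 0;
            default:
                    return -EOPNOTSUPP;
            }
    }

Because the jump target ("test" chain) has no hook of its own, it is the validate callback that catches the indirect attachment to the bridge input hook.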
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 4 ++++ net/bridge/netfilter/nft_meta_bridge.c | 1 + net/netfilter/nft_meta.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d27588c8dbd9..1139cde0fdc5 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -36,4 +36,8 @@ void nft_meta_set_eval(const struct nft_expr *expr, void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + #endif diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4b901d9f2e7c..ad47a921b701 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..8a6bc7630912 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -291,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { + struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; + if (priv->key != NFT_META_PKTTYPE) + return 0; + switch (ctx->afi->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; @@ -308,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -327,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx, len = sizeof(u8); break; case NFT_META_PKTTYPE: - err = nft_meta_set_init_pkttype(ctx); - if (err) - return err; len = sizeof(u8); break; default: return -EOPNOTSUPP; } + err = nft_meta_set_validate(ctx, expr, NULL); + if (err < 0) + return err; + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -407,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * -- cgit v1.2.3 From 4249fc1f023a2106170bbf715e2e1a0ebc2d5b1f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 23 Aug 2016 10:20:31 +0200 Subject: netfilter: ebtables: put module reference when an incorrect extension is found commit bcf493428840 ("netfilter: ebtables: Fix extension lookup with identical name") added a second lookup in case the extension that was found during the first lookup matched another extension with the same name, but didn't release the reference on the incorrect module. 
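For illustration, the reference-counting rule being restored: a successful xt_find_match() pins the extension's owning module, so a hit that turns out to be unusable must be released before retrying the lookup (this is essentially the hunk below, with the rule spelled out in the comment):

    match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
    if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
            if (!IS_ERR(match))
                    module_put(match->me);  /* drop the wrong-family hit */
            request_module("ebt_%s", m->u.name);
            match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
    }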
Fixes: bcf493428840 ("netfilter: ebtables: Fix extension lookup with identical name") Signed-off-by: Sabrina Dubroca Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index cceac5bb658f..0833c251aef7 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -368,6 +368,8 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { + if (!IS_ERR(match)) + module_put(match->me); request_module("ebt_%s", m->u.name); match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); } -- cgit v1.2.3 From 936523441bb64cdc9a5b263e8fd2782e70313a57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 6 Aug 2016 15:50:52 +0200 Subject: batman-adv: Add missing refcnt for last_candidate batadv_find_router dereferences last_bonding_candidate from orig_node without making sure that it has a valid reference. This reference has to be retrieved by increasing the reference counter while holding neigh_list_lock. The lock is required to avoid that batadv_last_bonding_replace removes the current last_bonding_candidate, reduces the reference counter and maybe destroys the object in this process. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7602c001e92b..3d199478c405 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -469,6 +469,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + kref_get(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + /** * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced @@ -539,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. 
*/ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -631,6 +654,9 @@ next: batadv_orig_ifinfo_put(next_candidate); } + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + return router; } -- cgit v1.2.3 From 1e5d343b8f23770e8ac5d31f5c439826bdb35148 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 23 Aug 2016 03:13:03 +0200 Subject: batman-adv: fix elp packet data reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skb_reserve() call only reserved headroom for the mac header, but not the elp packet header itself. Fixing this by using skb_put()'ing towards the skb tail instead of skb_push()'ing towards the skb head. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 7d170010beb9..ee08540ce503 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); elp_packet = (struct batadv_elp_packet *)elp_buff; memset(elp_packet, 0, BATADV_ELP_HLEN); -- cgit v1.2.3 From 554d072e7bc3e56de5893c8181110a547b2062c9 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Mon, 29 Aug 2016 12:37:35 +0300 Subject: mac80211: TDLS: don't require beaconing for AP BW Stop downgrading TDLS chandef when reaching the AP BW. The AP provides the necessary regulatory protection in this case. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=153961, which reported an infinite loop here. Reported-by: Kamil Toman Signed-off-by: Arik Nemtsov Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index b5d28f14b9cf..afca7d103684 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -333,10 +333,11 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, if (!uc.center_freq1) return; - /* proceed to downgrade the chandef until usable or the same */ + /* proceed to downgrade the chandef until usable or the same as AP BW */ while (uc.width > max_width || - !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, - sdata->wdev.iftype)) + (uc.width > sta->tdls_chandef.width && + !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, + sdata->wdev.iftype))) ieee80211_chandef_downgrade(&uc); if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { -- cgit v1.2.3 From c73c2484901139c28383b58eabcbf4d613e91518 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 28 Aug 2016 16:59:52 +0800 Subject: netfilter: nf_tables_netdev: remove redundant ip_hdr assignment We have already use skb_header_pointer to get the ip header pointer, so there's no need to use ip_hdr again. Moreover, in NETDEV INGRESS hook, ip header maybe not linear, so use ip_hdr is not appropriate, remove it. 
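For illustration, the reason ip_hdr() is wrong here: at the netdev ingress hook the IP header may sit outside the skb's linear area, so it has to be copied out with skb_header_pointer() rather than dereferenced in place. A minimal sketch, with a placeholder helper name:

    #include <linux/ip.h>
    #include <linux/skbuff.h>

    static bool ingress_ipv4_header_ok(const struct sk_buff *skb)
    {
            struct iphdr _iph;
            const struct iphdr *iph;

            iph = skb_header_pointer(skb, skb_network_offset(skb),
                                     sizeof(_iph), &_iph);
            if (!iph || iph->ihl < 5 || iph->version != 4)
                    return false;   /* truncated or not IPv4 */
            return true;
    }

Since iph may point at the on-stack copy _iph, re-fetching the header with ip_hdr(skb) afterwards would be both redundant and a reintroduction of the unsafe direct access.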
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_netdev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 5eefe4a355c6..75d696f11045 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -30,7 +30,6 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, if (!iph) return; - iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) return; -- cgit v1.2.3 From 9264251ee2a55bce8fb93826b3f581fb9eb7e2c2 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 31 Aug 2016 14:16:44 +0200 Subject: bridge: re-introduce 'fix parsing of MLDv2 reports' commit bc8c20acaea1 ("bridge: multicast: treat igmpv3 report with INCLUDE and no sources as a leave") seems to have accidentally reverted commit 47cc84ce0c2f ("bridge: fix parsing of MLDv2 reports"). This commit brings back a change to br_ip6_multicast_mld2_report() where parsing of MLDv2 reports stops when the first group is successfully added to the MDB cache. Fixes: bc8c20acaea1 ("bridge: multicast: treat igmpv3 report with INCLUDE and no sources as a leave") Signed-off-by: Davide Caratti Acked-by: Nikolay Aleksandrov Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a5423a1eec05..c5fea9393946 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1138,7 +1138,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, } else { err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, vid); - if (!err) + if (err) break; } } -- cgit v1.2.3 From c0338aff2260ea6c092806312dbb154cec07a242 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 28 Aug 2016 21:28:26 -0700 Subject: kcm: fix a socket double free Dmitry reported a double free on kcm socket, which could be easily reproduced by: #include #include int main() { int fd = syscall(SYS_socket, 0x29ul, 0x5ul, 0x0ul, 0, 0, 0); syscall(SYS_ioctl, fd, 0x89e2ul, 0x20a98000ul, 0, 0, 0); return 0; } This is because on the error path, after we install the new socket file, we call sock_release() to clean up the socket, which leaves the fd pointing to a freed socket. Fix this by calling sys_close() on that fd directly. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index cb39e05b166c..411693288648 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -2029,7 +2030,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) if (copy_to_user((void __user *)arg, &info, sizeof(info))) { err = -EFAULT; - sock_release(newsock); + sys_close(info.fd); } } -- cgit v1.2.3 From d2f394dc4816b7bd1b44981d83509f18f19c53f0 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 1 Sep 2016 16:22:16 +0200 Subject: tipc: fix random link resets while adding a second bearer In a dual bearer configuration, if the second tipc link becomes active while the first link still has pending nametable "bulk" updates, it randomly leads to reset of the second link. 
When a link is established, the function named_distribute(), fills the skb based on node mtu (allows room for TUNNEL_PROTOCOL) with NAME_DISTRIBUTOR message for each PUBLICATION. However, the function named_distribute() allocates the buffer by increasing the node mtu by INT_H_SIZE (to insert NAME_DISTRIBUTOR). This consumes the space allocated for TUNNEL_PROTOCOL. When establishing the second link, the link shall tunnel all the messages in the first link queue including the "bulk" update. As size of the NAME_DISTRIBUTOR messages while tunnelling, exceeds the link mtu the transmission fails (-EMSGSIZE). Thus, the synch point based on the message count of the tunnel packets is never reached leading to link timeout. In this commit, we adjust the size of name distributor message so that they can be tunnelled. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 6b626a64b517..a04fe9be1c60 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * + * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) @@ -141,9 +143,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) * - ITEM_SIZE; - uint msg_rem = msg_dsz; + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + ITEM_SIZE) * ITEM_SIZE; + u32 msg_rem = msg_dsz; list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ -- cgit v1.2.3 From d26c638c16cb54f6fb1507e27df93ede692db572 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 30 Aug 2016 10:09:21 +0200 Subject: ipv6: add missing netconf notif when 'all' is updated The 'default' value was not advertised. Fixes: f3a1bfb11ccb ("rtnl/ipv6: use netconf msg to advertise forwarding status") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f418d2eaeddd..2a688171a188 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -778,7 +778,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) } if (p == &net->ipv6.devconf_all->forwarding) { + int old_dflt = net->ipv6.devconf_dflt->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + if ((!newf) ^ (!old_dflt)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + addrconf_forward_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, -- cgit v1.2.3 From 29c994e361009142ec0bca6493cc8f7b0d3c561a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 30 Aug 2016 10:09:22 +0200 Subject: netconf: add a notif when settings are created All changes are notified, but the initial state was missing. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/ipv4/devinet.c | 11 +++++++---- net/ipv6/addrconf.c | 9 ++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 415e117967c7..062a67ca9a21 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table { }; static int __devinet_sysctl_register(struct net *net, char *dev_name, - struct ipv4_devconf *p) + int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; @@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, goto free; p->sysctl = t; + + inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: @@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev) if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, - &idev->cnf); + idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; @@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net) } #ifdef CONFIG_SYSCTL - err = __devinet_sysctl_register(net, "all", all); + err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; - err = __devinet_sysctl_register(net, "default", dflt); + err = __devinet_sysctl_register(net, "default", + NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2a688171a188..bdf368eff5ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6032,7 +6032,7 @@ static const struct ctl_table addrconf_sysctl[] = { static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { - int i; + int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -6052,6 +6052,13 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!p->sysctl_header) goto free; + if (!strcmp(dev_name, "all")) + ifindex = NETCONFA_IFINDEX_ALL; + else if (!strcmp(dev_name, "default")) + ifindex = NETCONFA_IFINDEX_DEFAULT; + else + ifindex = idev->dev->ifindex; + inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: -- cgit v1.2.3 From 85a3d4a9356b595d5440c3f1bf07ee7cecca1567 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 30 Aug 2016 17:44:29 +0200 Subject: net: bridge: don't increment tx_dropped in br_do_proxy_arp pskb_may_pull may fail due to various reasons (e.g. alloc failure), but the skb isn't changed/dropped and processing continues so we shouldn't increment tx_dropped. CC: Kyeyoon Park CC: Roopa Prabhu CC: Stephen Hemminger CC: bridge@lists.linux-foundation.org Fixes: 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_input.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8e486203d133..abe11f085479 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -80,13 +80,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - if (dev->flags & IFF_NOARP) + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) return; - if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; - return; - } parp = arp_hdr(skb); if (parp->ar_pro != htons(ETH_P_IP) || -- cgit v1.2.3 From 28b346cbc0715ae45b2814d857f1d8a7e6817ed8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 30 Aug 2016 11:55:23 -0400 Subject: tcp: fastopen: fix rcv_wup initialization for TFO server on SYN/data Yuchung noticed that on the first TFO server data packet sent after the (TFO) handshake, the server echoed the TCP timestamp value in the SYN/data instead of the timestamp value in the final ACK of the handshake. This problem did not happen on regular opens. The tcp_replace_ts_recent() logic that decides whether to remember an incoming TS value needs tp->rcv_wup to hold the latest receive sequence number that we have ACKed (latest tp->rcv_nxt we have ACKed). This commit fixes this issue by ensuring that a TFO server properly updates tp->rcv_wup to match tp->rcv_nxt at the time it sends a SYN/ACK for the SYN/data. Reported-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 54d9f9b0120f..62a5751d4fe1 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -226,6 +226,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; + tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ -- cgit v1.2.3 From 635c223cfa05af9523146b2f37e119d945f449ae Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 31 Aug 2016 14:15:05 +0800 Subject: rps: flow_dissector: Fix uninitialized flow_keys used in __skb_get_hash possibly The original codes depend on that the function parameters are evaluated from left to right. But the parameter's evaluation order is not defined in C standard actually. When flow_keys_have_l4(&keys) is invoked before ___skb_get_hash(skb, &keys, hashrnd) with some compilers or environment, the keys passed to flow_keys_have_l4 is not initialized. Fixes: 6db61d79c1e1 ("flow_dissector: Ignore flow dissector return value from ___skb_get_hash") Acked-by: Eric Dumazet Signed-off-by: Gao Feng Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61c5e..52742a02814f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -680,11 +680,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; __flow_hash_secret_init(); - __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), - flow_keys_have_l4(&keys)); + hash = ___skb_get_hash(skb, &keys, hashrnd); + + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -- cgit v1.2.3 From ab34380162cbc9b5172afdadf5136643c687bb73 Mon Sep 17 00:00:00 2001 From: Eli Cooper Date: Fri, 26 Aug 2016 23:52:29 +0800 Subject: ipv6: Don't unset flowi6_proto in ipxip6_tnl_xmit() Commit 8eb30be0352d0916 ("ipv6: Create ip6_tnl_xmit") unsets flowi6_proto in ip4ip6_tnl_xmit() and ip6ip6_tnl_xmit(). Since xfrm_selector_match() relies on this info, IPv6 packets sent by an ip6tunnel cannot be properly selected by their protocols after removing it. This patch puts flowi6_proto back. Cc: stable@vger.kernel.org Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Signed-off-by: Eli Cooper Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7b0481e3738f..888543debe4e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1174,6 +1174,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1233,6 +1234,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) -- cgit v1.2.3 From 2f86953e7436c9b9a4690909c5e2db24799e173b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 2 Sep 2016 10:22:54 +0200 Subject: l2tp: fix use-after-free during module unload Tunnel deletion is delayed by both a workqueue (l2tp_tunnel_delete -> wq -> l2tp_tunnel_del_work) and RCU (sk_destruct -> RCU -> l2tp_tunnel_destruct). By the time l2tp_tunnel_destruct() runs to destroy the tunnel and finish destroying the socket, the private data reserved via the net_generic mechanism has already been freed, but l2tp_tunnel_destruct() actually uses this data. Make sure tunnel deletion for the netns has completed before returning from l2tp_exit_net() by first flushing the tunnel removal workqueue, and then waiting for RCU callbacks to complete. Fixes: 167eb17e0b17 ("l2tp: create tunnel sockets in the right namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1e40dacaa137..a2ed3bda4ddc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1855,6 +1855,9 @@ static __net_exit void l2tp_exit_net(struct net *net) (void)l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); + + flush_workqueue(l2tp_wq); + rcu_barrier(); } static struct pernet_operations l2tp_net_ops = { -- cgit v1.2.3 From a41bd25ae67d3e4052c7f00ee9f2b4ba9219309e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 25 Aug 2016 18:42:35 +0200 Subject: sunrpc: fix UDP memory accounting The commit f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context"), as a side effect, moved the skb_free_datagram() call outside the scope of the related socket lock, but UDP sockets require such lock to be held for proper memory accounting. Fix it by replacing skb_free_datagram() with skb_free_datagram_locked(). Fixes: f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context") Reported-and-tested-by: Jan Stancek Signed-off-by: Paolo Abeni Cc: stable@vger.kernel.org # 4.4+ Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8ede3bc52481..bf168838a029 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) skb = skb_recv_datagram(sk, 0, 1, &err); if (skb != NULL) { xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); + skb_free_datagram_locked(sk, skb); continue; } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) -- cgit v1.2.3 From 24b27fc4cdf9e10c5e79e5923b6b7c2c5c95096c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 1 Sep 2016 22:18:34 -0700 Subject: bonding: Fix bonding crash Following few steps will crash kernel - (a) Create bonding master > modprobe bonding miimon=50 (b) Create macvlan bridge on eth2 > ip link add link eth2 dev mvl0 address aa:0:0:0:0:01 \ type macvlan (c) Now try adding eth2 into the bond > echo +eth2 > /sys/class/net/bond0/bonding/slaves Bonding does lots of things before checking if the device enslaved is busy or not. In this case when the notifier call-chain sends notifications, the bond_netdev_event() assumes that the rx_handler /rx_handler_data is registered while the bond_enslave() hasn't progressed far enough to register rx_handler for the new slave. This patch adds a rx_handler check that can be performed right at the beginning of the enslave code to avoid getting into this situation. Signed-off-by: Mahesh Bandewar Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 217e8da0628c..9599ed6f1213 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1341,9 +1341,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) slave_dev->name); } - /* already enslaved */ - if (slave_dev->flags & IFF_SLAVE) { - netdev_dbg(bond_dev, "Error: Device was already enslaved\n"); + /* already in-use? 
*/ + if (netdev_is_rx_handler_busy(slave_dev)) { + netdev_err(bond_dev, + "Error: Device is in use and cannot be enslaved\n"); return -EBUSY; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a788bf0affd..e8d79d4ebcfe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3267,6 +3267,7 @@ static inline void napi_free_frags(struct napi_struct *napi) napi->skb = NULL; } +bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); diff --git a/net/core/dev.c b/net/core/dev.c index dd6ce598de89..ea6312057a71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3974,6 +3974,22 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; } +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check + * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. + */ +bool netdev_is_rx_handler_busy(struct net_device *dev) +{ + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); +} +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); + /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for -- cgit v1.2.3 From 38f7bd94a97b542de86a2be9229289717e33a7a4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Sep 2016 14:56:49 -0700 Subject: Revert "af_unix: Fix splice-bind deadlock" This reverts commit c845acb324aa85a39650a14e7696982ceea75dc1. It turns out that it just replaces one deadlock with another one: we can still get the wrong lock ordering with the readlock due to overlayfs calling back into the filesystem layer and still taking the vfs locks after the readlock. The proper solution ends up being to just split the readlock into two pieces: the bind lock (taken *outside* the vfs locks) and the IO lock (taken *inside* the filesystem locks). The two locks are independent anyway. Signed-off-by: Linus Torvalds Reviewed-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/unix/af_unix.c | 66 +++++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f1dffe84f0d5..433ae1bbef97 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -954,20 +954,32 @@ fail: return NULL; } -static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode, - struct path *res) +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) { - int err; + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; - err = security_path_mknod(path, dentry, mode, 0); + /* + * All right, let's create it. 
+ */ + err = security_path_mknod(&path, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path->mnt); + res->mnt = mntget(path.mnt); res->dentry = dget(dentry); } } - + done_path_create(&path, dentry); return err; } @@ -978,12 +990,10 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err, name_err; + int err; unsigned int hash; struct unix_address *addr; struct hlist_head *list; - struct path path; - struct dentry *dentry; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -999,34 +1009,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - name_err = 0; - dentry = NULL; - if (sun_path[0]) { - /* Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - - if (IS_ERR(dentry)) { - /* delay report until after 'already bound' check */ - name_err = PTR_ERR(dentry); - dentry = NULL; - } - } - err = mutex_lock_interruptible(&u->readlock); if (err) - goto out_path; + goto out; err = -EINVAL; if (u->addr) goto out_up; - if (name_err) { - err = name_err == -EEXIST ? -EADDRINUSE : name_err; - goto out_up; - } - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) @@ -1037,11 +1027,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = hash ^ sk->sk_type; atomic_set(&addr->refcnt, 1); - if (dentry) { - struct path u_path; + if (sun_path[0]) { + struct path path; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(dentry, &path, mode, &u_path); + err = unix_mknod(sun_path, mode, &path); if (err) { if (err == -EEXIST) err = -EADDRINUSE; @@ -1049,9 +1039,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = u_path; + u->path = path; list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); @@ -1074,10 +1064,6 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->readlock); -out_path: - if (dentry) - done_path_create(&path, dentry); - out: return err; } -- cgit v1.2.3 From 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Sep 2016 14:43:53 -0700 Subject: af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Right now we use the 'readlock' both for protecting some of the af_unix IO path and for making the bind be single-threaded. The two are independent, but using the same lock makes for a nasty deadlock due to ordering with regards to filesystem locking. The bind locking would want to nest outside the VSF pathname locking, but the IO locking wants to nest inside some of those same locks. We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix splice-bind deadlock") which moved the readlock inside the vfs locks, but that caused problems with overlayfs that will then call back into filesystem routines that take the lock in the wrong order anyway. 
Splitting the locks means that we can go back to having the bind lock be the outermost lock, and we don't have any deadlocks with lock ordering. Acked-by: Rainer Weikusat Acked-by: Al Viro Signed-off-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9b4c418bebd8..fd60eccb59a6 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -52,7 +52,7 @@ struct unix_sock { struct sock sk; struct unix_address *addr; struct path path; - struct mutex readlock; + struct mutex iolock, bindlock; struct sock *peer; struct list_head link; atomic_long_t inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 433ae1bbef97..8309687a56b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - if (mutex_lock_interruptible(&u->readlock)) + if (mutex_lock_interruptible(&u->iolock)) return -EINTR; sk->sk_peek_off = val; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); return 0; } @@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); - mutex_init(&u->readlock); /* single task reading lock */ + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); @@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) return err; @@ -895,7 +896,7 @@ retry: spin_unlock(&unix_table_lock); err = 0; -out: mutex_unlock(&u->readlock); +out: mutex_unlock(&u->bindlock); return err; } @@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; @@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out_unlock: spin_unlock(&unix_table_lock); out_up: - mutex_unlock(&u->readlock); + mutex_unlock(&u->bindlock); out: return err; } @@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, if (false) { alloc_skb: unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) goto err; } - /* we must acquire readlock as we modify already present + /* we must acquire iolock as we modify already present * skbs in the sk_receive_queue and mess with skb->len */ - err = mutex_lock_interruptible(&unix_sk(other)->readlock); + err = mutex_lock_interruptible(&unix_sk(other)->iolock); if (err) { err = flags & MSG_DONTWAIT ? 
-EAGAIN : -ERESTARTSYS; goto err; @@ -2032,7 +2033,7 @@ alloc_skb: } unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); other->sk_data_ready(other); scm_destroy(&scm); @@ -2041,7 +2042,7 @@ alloc_skb: err_state_unlock: unix_state_unlock(other); err_unlock: - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) @@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, @@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (skb) break; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &err, &timeo, last)); - if (!skb) { /* implies readlock unlocked */ + if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && @@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); out: return err; } @@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); if (flags & MSG_PEEK) skip = sk_peek_offset(sk, flags); @@ -2326,7 +2327,7 @@ again: break; } - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len); @@ -2337,7 +2338,7 @@ again: goto out; } - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -2440,7 +2441,7 @@ unlock: } } while (size); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else @@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk, int ret; struct unix_sock *u = unix_sk(sk); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); return ret; } -- cgit v1.2.3 From 5210d393ef84e5d2a4854671a9af2d97fd1b8dd4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Fri, 2 Sep 2016 20:49:12 +0800 Subject: netfilter: nf_tables_trace: fix endiness when dump chain policy NFTA_TRACE_POLICY attribute is big endian, but we forget to call htonl to convert it. Fortunately, this attribute is parsed as big endian in libnftnl. 
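For illustration, the endianness rule at work: nla_put_be32() stores its argument verbatim and the attribute is declared big endian, so a host-order value has to be converted explicitly (this mirrors the one-line hunk below):

    u32 policy = info->basechain->policy;   /* host byte order */

    if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(policy)))
            goto nla_put_failure;

Since libnftnl already reads the attribute as big endian, only the kernel side needed the conversion.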
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62e91..fa24a5b398b1 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info) break; case NFT_TRACETYPE_POLICY: if (nla_put_be32(skb, NFTA_TRACE_POLICY, - info->basechain->policy)) + htonl(info->basechain->policy))) goto nla_put_failure; break; } -- cgit v1.2.3 From d1a6cba576fc7c43e476538fe5aa72fe04bd80e1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 6 Sep 2016 22:31:02 +0800 Subject: netfilter: nft_chain_route: re-route before skb is queued to userspace Imagine such situation, user add the following nft rules, and queue the packets to userspace for further check: # ip rule add fwmark 0x0/0x1 lookup eth0 # ip rule add fwmark 0x1/0x1 lookup eth1 # nft add table filter # nft add chain filter output {type route hook output priority 0 \;} # nft add rule filter output mark set 0x1 # nft add rule filter output queue num 0 But after we reinject the skbuff, the packet will be sent via the wrong route, i.e. in this case, the packet will be routed via eth0 table, not eth1 table. Because we skip to do re-route when verdict is NF_QUEUE, even if the mark was changed. Acctually, we should not touch sk_buff if verdict is NF_DROP or NF_STOLEN, and when re-route fails, return NF_DROP with error code. This is consistent with the mangle table in iptables. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++++++---- net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2375b0a8be46..30493beb611a 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, __be32 saddr, daddr; u_int8_t tos; const struct iphdr *iph; + int err; /* root is playing with raw sockets. 
*/ if (skb->len < sizeof(struct iphdr) || @@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv, tos = iph->tos; ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..2535223ba956 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, struct in6_addr saddr, daddr; u_int8_t hop_limit; u32 mark, flowlabel; + int err; /* malformed packet, drop it */ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) @@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv, flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE && + if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } -- cgit v1.2.3 From 03c2778a938aaba0893f6d6cdc29511d91a79848 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 2 Sep 2016 14:39:50 -0400 Subject: ipv6: release dst in ping_v6_sendmsg Neither the failure or success paths of ping_v6_sendmsg release the dst it acquires. This leads to a flood of warnings from "net/core/dst.c:288 dst_release" on older kernels that don't have 8bf4ada2e21378816b28205427ee6b0e1ca4c5f1 backported. That patch optimistically hoped this had been fixed post 3.10, but it seems at least one case wasn't, where I've seen this triggered a lot from machines doing unprivileged icmp sockets. Cc: Martin Lau Signed-off-by: Dave Jones Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ping.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 0900352c924c..0e983b694ee8 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -126,8 +126,10 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rt = (struct rt6_info *) dst; np = inet6_sk(sk); - if (!np) - return -EBADF; + if (!np) { + err = -EBADF; + goto dst_err_out; + } if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = np->mcast_oif; @@ -163,6 +165,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } release_sock(sk); +dst_err_out: + dst_release(dst); + if (err) return err; -- cgit v1.2.3 From 78d506e1b7071b24850fd5ac22b896c459b0a04c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Sep 2016 11:22:49 -0400 Subject: xprtrdma: Revert 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion...") Receive buffer exhaustion, if it were to actually occur, would be catastrophic. 
However, when there are no reply buffers to post, that means all of them have already been posted and are waiting for incoming replies. By design, there can never be more RPCs in flight than there are available receive buffers. A receive buffer can be left posted after an RPC exits without a received reply; say, due to a credential problem or a soft timeout. This does not result in fewer posted receive buffers than there are pending RPCs, and there is already logic in xprtrdma to deal appropriately with this case. It also looks like the "+ 2" that was removed was accidentally accommodating the number of extra receive buffers needed for receiving backchannel requests. That will need to be addressed by another patch. Fixes: 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion can be...") Signed-off-by: Chuck Lever Reviewed-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 536d0be3f61b..fefcba982415 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -923,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests; i++) { + for (i = 0; i < buf->rb_max_requests + 2; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1076,6 +1076,8 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) /* * Get a set of request/reply buffers. + * + * Reply buffer (if available) is attached to send buffer upon return. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -1094,13 +1096,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); + pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; out_repbuf: - list_add(&req->rl_free, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); - return NULL; + pr_warn("RPC: %s: out of reply buffers\n", __func__); + req->rl_reply = NULL; + return req; } /* -- cgit v1.2.3 From 05c974669ecec510a85d8534099bb75404e82c41 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Sep 2016 11:22:58 -0400 Subject: xprtrdma: Fix receive buffer accounting An RPC can terminate before its reply arrives, if a credential problem or a soft timeout occurs. After this happens, xprtrdma reports it is out of Receive buffers. A Receive buffer is posted before each RPC is sent, and returned to the buffer pool when a reply is received. If no reply is received for an RPC, that Receive buffer remains posted. But xprtrdma tries to post another when the next RPC is sent. If this happens a few dozen times, there are no receive buffers left to be posted at send time. I don't see a way for a transport connection to recover at that point, and it will spit warnings and unnecessarily delay RPCs on occasion for its remaining lifetime. Commit 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") removed a little bit of logic to detect this case and not provide a Receive buffer so no more buffers are posted, and then transport operation continues correctly. We didn't understand what that logic did, and it wasn't commented, so it was removed as part of the overhaul to support backchannel requests. 
Restore it, but be wary of the need to keep extra Receives posted to deal with backchannel requests. Fixes: 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") Signed-off-by: Chuck Lever Reviewed-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 41 +++++++++++++++++++++++++++++------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fefcba982415..799cce6cbe45 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include /* try_module_get()/module_put() */ @@ -923,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1018,6 +1019,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rep = rpcrdma_buffer_get_rep_locked(buf); rpcrdma_destroy_rep(ia, rep); } + buf->rb_send_count = 0; spin_lock(&buf->rb_reqslock); while (!list_empty(&buf->rb_allreqs)) { @@ -1032,6 +1034,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); + buf->rb_recv_count = 0; rpcrdma_destroy_mrs(buf); } @@ -1074,6 +1077,23 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) spin_unlock(&buf->rb_mwlock); } +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) +{ + /* If an RPC previously completed without a reply (say, a + * credential problem or a soft timeout occurs) then hold off + * on supplying more Receive buffers until the number of new + * pending RPCs catches up to the number of posted Receives. + */ + if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) + return NULL; + + if (unlikely(list_empty(&buffers->rb_recv_bufs))) + return NULL; + buffers->rb_recv_count++; + return rpcrdma_buffer_get_rep_locked(buffers); +} + /* * Get a set of request/reply buffers. 
* @@ -1087,10 +1107,9 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_send_bufs)) goto out_reqbuf; + buffers->rb_send_count++; req = rpcrdma_buffer_get_req_locked(buffers); - if (list_empty(&buffers->rb_recv_bufs)) - goto out_repbuf; - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); return req; @@ -1098,11 +1117,6 @@ out_reqbuf: spin_unlock(&buffers->rb_lock); pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; -out_repbuf: - spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; } /* @@ -1119,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; spin_lock(&buffers->rb_lock); + buffers->rb_send_count--; list_add_tail(&req->rl_free, &buffers->rb_send_bufs); - if (rep) + if (rep) { + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + } spin_unlock(&buffers->rb_lock); } @@ -1135,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; spin_lock(&buffers->rb_lock); - if (!list_empty(&buffers->rb_recv_bufs)) - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); } @@ -1150,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; spin_lock(&buffers->rb_lock); + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 670fad57153a..a71b0f5897d8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -321,6 +321,7 @@ struct rpcrdma_buffer { char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ + int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; -- cgit v1.2.3 From 5a56a0b3a45dd0cc5b2f7bec6afd053a474ed9f5 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 5 Sep 2016 10:20:20 +1200 Subject: net: Don't delete routes in different VRFs When deleting an IP address from an interface, there is a clean-up of routes which refer to this local address. However, there was no check to see that the VRF matched. This meant that deletion wasn't confined to the VRF it should have been. To solve this, a new field has been added to fib_info to hold a table id. When removing fib entries corresponding to a local ip address, this table id is also used in the comparison. The table id is populated when the fib_info is created. This was already done in some places, but not in ip_rt_ioctl(). This has now been fixed. Fixes: 021dd3b8a142 ("net: Add routes to the table associated with the device") Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: Mark Tomlinson Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/fib_semantics.c | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 4079fc18ffe4..7d4a72e75f33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -111,6 +111,7 @@ struct fib_info { unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; + u32 fib_tb_id; u32 fib_priority; u32 *fib_metrics; #define fib_mtu fib_metrics[RTAX_MTU-1] @@ -319,7 +320,7 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); -int fib_sync_down_addr(struct net *net, __be32 local); +int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); extern u32 fib_multipath_secret __read_mostly; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..1b25daf8c7f1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1027,7 +1028,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e67d..e9f56225e53f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1337,18 +1338,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + struct net *net = dev_net(dev); + int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; -- cgit v1.2.3 From 751eb6b6042a596b0080967c1a529a9fe98dac1d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 5 Sep 2016 16:06:31 +0800 Subject: ipv6: addrconf: fix dev refcont leak when DAD failed In general, when DAD detects an IPv6 duplicate address, ifp->state is set to INET6_IFADDR_STATE_ERRDAD and DAD is stopped by a delayed work item; the call tree looks like this: ndisc_recv_ns -> addrconf_dad_failure <- missing ifp put -> addrconf_mod_dad_work -> schedule addrconf_dad_work() -> addrconf_dad_stop() <- missing ifp hold before calling it addrconf_dad_failure() is called with an ifp refcount held but not put. addrconf_dad_work() calls addrconf_dad_stop() without holding an extra refcount.
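For reference, a minimal sketch of the hold/put pairing that the patch below establishes (illustrative only; the helper names are shortened stand-ins for the real functions in net/ipv6/addrconf.c):

	/* Failure path: hand the address over to the worker, then drop the
	 * reference we were called with.
	 */
	static void dad_failure(struct inet6_ifaddr *ifp)
	{
		spin_lock_bh(&ifp->lock);
		ifp->state = INET6_IFADDR_STATE_ERRDAD;
		spin_unlock_bh(&ifp->lock);
		addrconf_mod_dad_work(ifp, 0);
		in6_ifa_put(ifp);
	}

	/* Delayed work, ERRDAD case: take a reference of our own because
	 * addrconf_dad_stop() consumes one.
	 */
	static void dad_work_errdad(struct inet6_ifaddr *ifp)
	{
		in6_ifa_hold(ifp);
		addrconf_dad_stop(ifp, 1);
	}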
This will not cause any issue normally, but the race between addrconf_dad_failure() and addrconf_dad_work() may leak an ifp refcount, after which the netdevice cannot be unregistered; dmesg shows the following messages: IPv6: eth0: IPv6 duplicate address fe80::XX:XXXX:XXXX:XX detected! ... unregister_netdevice: waiting for eth0 to become free. Usage count = 1 Cc: stable@vger.kernel.org Fixes: c15b1ccadb32 ("ipv6: move DAD and addrconf_verify processing to workqueue") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bdf368eff5ab..2f1f5d439788 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1948,6 +1948,7 @@ errdad: spin_unlock_bh(&ifp->lock); addrconf_mod_dad_work(ifp, 0); + in6_ifa_put(ifp); } /* Join to solicited addr multicast group. @@ -3857,6 +3858,7 @@ static void addrconf_dad_work(struct work_struct *w) addrconf_dad_begin(ifp); goto out; } else if (action == DAD_ABORT) { + in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); if (disable_ipv6) addrconf_ifdown(idev->dev, 0); -- cgit v1.2.3 From 71a17de30733822b1ca6fbb3792581f5e7ee13de Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:43:39 +0100 Subject: rxrpc: Whitespace cleanup Remove some whitespace. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index bb342f5fe7e4..ad702f9f8d1f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -720,7 +720,6 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { } } - static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) { if (!rxrpc_get_connection_maybe(conn)) @@ -879,7 +878,7 @@ int __init rxrpc_init_security(void); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); int rxrpc_init_server_conn_security(struct rxrpc_connection *); - + /* * sendmsg.c */ -- cgit v1.2.3 From e796cb419237f54b96442ae7feca1859c693865c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:42:15 +0100 Subject: rxrpc: Delete unused rxrpc_kernel_free_skb() Delete rxrpc_kernel_free_skb() as it's unused. Signed-off-by: David Howells --- net/rxrpc/skbuff.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 9752f8b1fdd0..a546a2ba6341 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -148,19 +148,6 @@ void rxrpc_packet_destructor(struct sk_buff *skb) _leave(""); } -/** - * rxrpc_kernel_free_skb - Free an RxRPC socket buffer - * @skb: The socket buffer to be freed - * - * Let RxRPC free its own socket buffer, permitting it to maintain debug - * accounting. - */ -void rxrpc_kernel_free_skb(struct sk_buff *skb) -{ - rxrpc_free_skb(skb); -} -EXPORT_SYMBOL(rxrpc_kernel_free_skb); - /* * Note the existence of a new-to-us socket buffer (allocated or dequeued). */ -- cgit v1.2.3 From fff72429c2e83bdbe32dc7f1ad6398dfe50750c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:34:21 +0100 Subject: rxrpc: Improve the call tracking tracepoint Improve the call tracking tracepoint by showing more differentiation between some of the put and get events, including: (1) Getting and putting refs for the socket call user ID tree. (2) Getting and putting refs for queueing and failing to queue the call processor work item.
Note that these aren't necessarily used in this patch, but will be taken advantage of in future patches. An enum is added for the event subtype numbers rather than coding them directly as decimal numbers and a table of 3-letter strings is provided rather than a sequence of ?: operators. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 11 +++------- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 22 ++++++++++++++++++-- net/rxrpc/call_accept.c | 10 ++++----- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 48 ++++++++++++++++++++++++++++---------------- net/rxrpc/input.c | 6 +++--- net/rxrpc/recvmsg.c | 23 +++++++++++---------- net/rxrpc/sendmsg.c | 4 ++-- net/rxrpc/skbuff.c | 2 +- 10 files changed, 79 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cbe574ea674b..30164896f1f6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,7 +17,8 @@ #include TRACE_EVENT(rxrpc_call, - TP_PROTO(struct rxrpc_call *call, int op, int usage, int nskb, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, + int usage, int nskb, const void *where, const void *aux), TP_ARGS(call, op, usage, nskb, where, aux), @@ -42,13 +43,7 @@ TRACE_EVENT(rxrpc_call, TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", __entry->call, - (__entry->op == 0 ? "NWc" : - __entry->op == 1 ? "NWs" : - __entry->op == 2 ? "SEE" : - __entry->op == 3 ? "GET" : - __entry->op == 4 ? "Gsb" : - __entry->op == 5 ? "PUT" : - "Psb"), + rxrpc_call_traces[__entry->op], __entry->usage, __entry->nskb, __entry->where, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index b66a9e6f8d04..8356cd003d51 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -296,7 +296,7 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); rxrpc_purge_queue(&call->knlrecv_queue); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } EXPORT_SYMBOL(rxrpc_kernel_end_call); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ad702f9f8d1f..913255a53564 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -508,6 +508,24 @@ struct rxrpc_call { unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; }; +enum rxrpc_call_trace { + rxrpc_call_new_client, + rxrpc_call_new_service, + rxrpc_call_queued, + rxrpc_call_queued_ref, + rxrpc_call_seen, + rxrpc_call_got, + rxrpc_call_got_skb, + rxrpc_call_got_userid, + rxrpc_call_put, + rxrpc_call_put_skb, + rxrpc_call_put_userid, + rxrpc_call_put_noqueue, + rxrpc_call__nr_trace +}; + +extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; + #include /* @@ -555,8 +573,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void rxrpc_see_call(struct rxrpc_call *); -void rxrpc_get_call(struct rxrpc_call *); -void rxrpc_put_call(struct rxrpc_call *); +void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); +void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void __exit rxrpc_destroy_all_calls(void); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 68a439e30df1..487ae7aa86db 100644 --- a/net/rxrpc/call_accept.c +++ 
b/net/rxrpc/call_accept.c @@ -115,7 +115,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, write_lock(&rx->call_lock); if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); spin_lock(&call->conn->state_lock); if (sp->hdr.securityIndex > 0 && @@ -155,7 +155,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("done"); read_unlock_bh(&local->services_lock); rxrpc_free_skb(notification); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = 0"); return 0; @@ -166,11 +166,11 @@ invalid_service: read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); rxrpc_queue_call(call); } read_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); ret = -ECONNREFUSED; error: rxrpc_free_skb(notification); @@ -341,6 +341,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ + rxrpc_get_call(call, rxrpc_call_got_userid); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; rb_link_node(&call->sock_node, parent, pp); @@ -351,7 +352,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, BUG(); rxrpc_queue_call(call); - rxrpc_get_call(call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); _leave(" = %p{%d}", call, call->debug_id); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 4754c7fb6242..fee8b6ddb334 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -1246,7 +1246,7 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 65691742199b..3166b5222435 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,21 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; +const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { + [rxrpc_call_new_client] = "NWc", + [rxrpc_call_new_service] = "NWs", + [rxrpc_call_queued] = "QUE", + [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_seen] = "SEE", + [rxrpc_call_got] = "GOT", + [rxrpc_call_got_skb] = "Gsk", + [rxrpc_call_got_userid] = "Gus", + [rxrpc_call_put] = "PUT", + [rxrpc_call_put_skb] = "Psk", + [rxrpc_call_put_userid] = "Pus", + [rxrpc_call_put_noqueue] = "PNQ", +}; + struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); @@ -96,7 +111,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, return NULL; found_extant_call: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, atomic_read(&call->usage)); return call; @@ -252,8 +267,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } - rxrpc_get_call(call); - + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); @@ -275,7 +289,7 @@ error: write_lock(&rx->call_lock); rb_erase(&call->sock_node, &rx->calls); write_unlock(&rx->call_lock); - 
rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put_userid); write_lock_bh(&rxrpc_call_lock); list_del_init(&call->link); @@ -283,7 +297,7 @@ error: set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); @@ -296,7 +310,7 @@ found_user_ID_now_present: write_unlock(&rx->call_lock); set_bit(RXRPC_CALL_RELEASED, &call->flags); call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = -EEXIST [%p]", call); return ERR_PTR(-EEXIST); } @@ -322,8 +336,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); - trace_rxrpc_call(candidate, 1, atomic_read(&candidate->usage), - 0, here, NULL); + trace_rxrpc_call(candidate, rxrpc_call_new_service, + atomic_read(&candidate->usage), 0, here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; @@ -358,7 +372,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, read_unlock(&call->state_lock); goto aborted_call; default: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); read_unlock(&call->state_lock); goto extant_call; } @@ -447,20 +461,20 @@ void rxrpc_see_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 2, n, m, here, 0); + trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); } } /* * Note the addition of a ref on a call. */ -void rxrpc_get_call(struct rxrpc_call *call) +void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 3, n, m, here, 0); + trace_rxrpc_call(call, op, n, m, here, NULL); } /* @@ -472,7 +486,7 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) int n = atomic_inc_return(&call->usage); int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, 4, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); } /* @@ -575,7 +589,7 @@ static void rxrpc_dead_call_expired(unsigned long _call) write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_DEAD; write_unlock_bh(&call->state_lock); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); } /* @@ -632,7 +646,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* * release a call */ -void rxrpc_put_call(struct rxrpc_call *call) +void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n, m; @@ -641,7 +655,7 @@ void rxrpc_put_call(struct rxrpc_call *call) n = atomic_dec_return(&call->usage); m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, 5, n, m, here, NULL); + trace_rxrpc_call(call, op, n, m, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); @@ -661,7 +675,7 @@ void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) n = atomic_dec_return(&call->usage); m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, 6, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 72f016cfaaf5..f7239a6f9181 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -537,7 +537,7 @@ static void 
rxrpc_post_packet_to_call(struct rxrpc_call *call, } read_unlock(&call->state_lock); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -545,12 +545,12 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, else rxrpc_fast_process_packet(call, skb); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); goto done; resend_final_ack: _debug("final ack again"); - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0ab7b334bab1..97f8ee76c67c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -79,7 +79,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, + rxrpc_call_put); return -ENODATA; } } @@ -137,13 +138,13 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (call != continue_call || skb->mark != RXRPC_SKB_MARK_DATA) { release_sock(&rx->sk); - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [noncont]", copied); return copied; } } - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); /* copy the peer address and timestamp */ if (!continue_call) { @@ -233,7 +234,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (!continue_call) continue_call = sp->call; else - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); call = NULL; if (flags & MSG_PEEK) { @@ -255,9 +256,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, out: release_sock(&rx->sk); if (call) - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d [data]", copied); return copied; @@ -341,18 +342,18 @@ terminal_message: } release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; copy_error: _debug("copy error"); release_sock(&rx->sk); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); _leave(" = %d", ret); return ret; @@ -361,7 +362,7 @@ wait_interrupted: wait_error: finish_wait(sk_sleep(&rx->sk), &wait); if (continue_call) - rxrpc_put_call(continue_call); + rxrpc_put_call(continue_call, rxrpc_call_put); if (copied) copied = ret; _leave(" = %d [waitfail %d]", copied, ret); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7376794a0308..803078bea507 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -534,7 +534,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_accept_call(rx, user_call_ID, NULL); if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); return 0; } @@ -573,7 +573,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } - rxrpc_put_call(call); + rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; } diff --git 
a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index a546a2ba6341..c0613ab6d2d5 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -35,7 +35,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) /* get an extra ref on the call for the final-ACK generator to * release */ - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); -- cgit v1.2.3 From f4fdb3525b247e1b4f9cc563641494c96c80f91a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 14:45:26 +0100 Subject: rxrpc: Use call->peer rather than call->conn->params.peer Use call->peer rather than call->conn->params.peer to avoid the possibility of call->conn being NULL and, whilst we're at it, check it for NULL before we access it. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 3166b5222435..060ddc32a85e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -514,9 +514,11 @@ void rxrpc_release_call(struct rxrpc_call *call) */ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - spin_lock(&conn->params.peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&conn->params.peer->lock); + if (call->peer) { + spin_lock(&call->peer->lock); + hlist_del_init(&call->error_link); + spin_unlock(&call->peer->lock); + } write_lock_bh(&rx->call_lock); if (!list_empty(&call->accept_link)) { -- cgit v1.2.3 From 278ac0cdd5e516bdef2b9b8f5a4dd6366a5bccfe Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 15:19:25 +0100 Subject: rxrpc: Cache the security index in the rxrpc_call struct Cache the security index in the rxrpc_call struct so that we can get at it even when the call has been disconnected and the connection pointer cleared. 
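The idea, sketched (illustrative only; rxrpc_attach_call_to_conn() is a made-up helper standing in for the places where call->conn is actually assigned):

	static void rxrpc_attach_call_to_conn(struct rxrpc_call *call,
					      struct rxrpc_connection *conn)
	{
		call->conn = conn;
		call->security_ix = conn->security_ix;	/* copy now, usable after disconnect */
	}

Readers such as the data-input and sendmsg paths can then consult call->security_ix without dereferencing call->conn.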
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_object.c | 1 + net/rxrpc/conn_client.c | 3 +++ net/rxrpc/input.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 5 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 913255a53564..e3dfc9da05fe 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -475,6 +475,7 @@ struct rxrpc_call { atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ + u8 security_ix; /* Security type */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 060ddc32a85e..83019e489555 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -345,6 +345,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate->peer = conn->params.peer; candidate->cid = sp->hdr.cid; candidate->call_id = sp->hdr.callNumber; + candidate->security_ix = sp->hdr.securityIndex; candidate->rx_data_post = 0; candidate->state = RXRPC_CALL_SERVER_ACCEPTING; candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 82de1aeaef21..9344a8416ceb 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -348,6 +348,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, if (cp->exclusive) { call->conn = candidate; + call->security_ix = candidate->security_ix; _leave(" = 0 [exclusive %d]", candidate->debug_id); return 0; } @@ -395,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, candidate_published: set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); call->conn = candidate; + call->security_ix = candidate->security_ix; spin_unlock(&local->client_conns_lock); _leave(" = 0 [new %d]", candidate->debug_id); return 0; @@ -412,6 +414,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); call->conn = conn; + call->security_ix = conn->security_ix; list_add(&call->chan_wait_link, &conn->waiting_calls); spin_unlock(&conn->channel_lock); _leave(" = 0 [extant %d]", conn->debug_id); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index f7239a6f9181..9242fefd7f40 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -198,7 +198,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, /* if the packet need security things doing to it, then it goes down * the slow path */ - if (call->conn->security_ix) + if (call->security_ix) goto enqueue_packet; sp->call = call; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 803078bea507..2439aff131c7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -322,7 +322,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.serial = atomic_inc_return(&conn->serial); sp->hdr.type = RXRPC_PACKET_TYPE_DATA; sp->hdr.userStatus = 0; - sp->hdr.securityIndex = conn->security_ix; + sp->hdr.securityIndex = call->security_ix; sp->hdr._rsvd = 0; sp->hdr.serviceId = call->service_id; -- cgit v1.2.3 From 8b7fac50ab7f2668c43795c135025c472922a344 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 15:28:54 +0100 Subject: rxrpc: Pass the connection pointer to rxrpc_post_packet_to_call() Pass the connection pointer to rxrpc_post_packet_to_call() as the call might get disconnected whilst we're looking at it, but the connection pointer determined by rxrpc_data_read() 
is guaranteed by RCU for the duration of the call. Signed-off-by: David Howells --- net/rxrpc/input.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9242fefd7f40..52da4373131f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -497,7 +497,8 @@ protocol_error: * post an incoming packet to the appropriate call/socket to deal with * - must get rid of the sk_buff, either by freeing it or by queuing it */ -static void rxrpc_post_packet_to_call(struct rxrpc_call *call, +static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, + struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp; @@ -558,7 +559,7 @@ resend_final_ack: dead_call: if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(call->conn->params.local, skb); + rxrpc_reject_packet(conn->params.local, skb); goto unlock; } free_unlock: @@ -754,7 +755,7 @@ void rxrpc_data_ready(struct sock *sk) goto cant_route_call; rxrpc_see_call(call); - rxrpc_post_packet_to_call(call, skb); + rxrpc_post_packet_to_call(conn, call, skb); goto out_unlock; } -- cgit v1.2.3 From 6543ac523558b2392271f3f8088e6455b3f00bb1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 15:26:39 +0100 Subject: rxrpc: Use rxrpc_is_service_call() rather than rxrpc_conn_is_service() Use rxrpc_is_service_call() rather than rxrpc_conn_is_service() if the call is available just in case call->conn is NULL. Signed-off-by: David Howells --- net/rxrpc/input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 52da4373131f..8267f42a7753 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -357,7 +357,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_PACKET_TYPE_BUSY: _proto("Rx BUSY %%%u", sp->hdr.serial); - if (rxrpc_conn_is_service(call->conn)) + if (rxrpc_is_service_call(call)) goto protocol_error; write_lock_bh(&call->state_lock); @@ -525,7 +525,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, default: goto dead_call; case RXRPC_CALL_SUCCEEDED: - if (rxrpc_conn_is_service(call->conn)) + if (rxrpc_is_service_call(call)) goto dead_call; goto resend_final_ack; } -- cgit v1.2.3 From 8d94aa381dab19f3c0f524f5d255248b0ae50125 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 09:19:31 +0100 Subject: rxrpc: Calls shouldn't hold socket refs rxrpc calls shouldn't hold refs on the sock struct. This was done so that the socket wouldn't go away whilst the call was in progress, such that the call could reach the socket's queues. However, we can mark the socket as requiring an RCU release and rely on the RCU read lock. To make this work, we do: (1) rxrpc_release_call() removes the call's call user ID. This is now only called from socket operations and not from the call processor: rxrpc_accept_call() / rxrpc_kernel_accept_call() rxrpc_reject_call() / rxrpc_kernel_reject_call() rxrpc_kernel_end_call() rxrpc_release_calls_on_socket() rxrpc_recvmsg() Though it is also called in the cleanup path of rxrpc_accept_incoming_call() before we assign a user ID. (2) Pass the socket pointer into rxrpc_release_call() rather than getting it from the call so that we can get rid of uninitialised calls. 
(3) Fix call processor queueing to pass a ref to the work queue and to release that ref at the end of the processor function (or to pass it back to the work queue if we have to requeue). (4) Skip out of the call processor function asap if the call is complete and don't requeue it if the call is complete. (5) Clean up the call immediately that the refcount reaches 0 rather than trying to defer it. Actual deallocation is deferred to RCU, however. (6) Don't hold socket refs for allocated calls. (7) Use the RCU read lock when queueing a message on a socket and treat the call's socket pointer according to RCU rules and check it for NULL. We also need to use the RCU read lock when viewing a call through procfs. (8) Transmit the final ACK/ABORT to a client call in rxrpc_release_call() if this hasn't been done yet so that we can then disconnect the call. Once the call is disconnected, it won't have any access to the connection struct and the UDP socket for the call work processor to be able to send the ACK. Terminal retransmission will be handled by the connection processor. (9) Release all calls immediately on the closing of a socket rather than trying to defer this. Incomplete calls will be aborted. The call refcount model is much simplified. Refs are held on the call by: (1) A socket's user ID tree. (2) A socket's incoming call secureq and acceptq. (3) A kernel service that has a call in progress. (4) A queued call work processor. We have to take care to put any call that we failed to queue. (5) sk_buffs on a socket's receive queue. A future patch will get rid of this. Whilst we're at it, we can do: (1) Get rid of the RXRPC_CALL_EV_RELEASE event. Release is now done entirely from the socket routines and never from the call's processor. (2) Get rid of the RXRPC_CALL_DEAD state. Calls now end in the RXRPC_CALL_COMPLETE state. (3) Get rid of the rxrpc_call::destroyer work item. Calls are now torn down when their refcount reaches 0 and then handed over to RCU for final cleanup. (4) Get rid of the rxrpc_call::deadspan timer. Calls are cleaned up immediately they're finished with and don't hang around. Post-completion retransmission is handled by the connection processor once the call is disconnected. (5) Get rid of the dead call expiry setting as there's no longer a timer to set. (6) rxrpc_destroy_all_calls() can just check that the call list is empty. 
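The queueing rule this introduces, sketched (simplified from the patch below; tracing and the work_pending() check are omitted):

	/* Take a ref for the work queue; put it back if the call was dead or
	 * already queued.
	 */
	bool rxrpc_queue_call(struct rxrpc_call *call)
	{
		if (__atomic_add_unless(&call->usage, 1, 0) == 0)
			return false;
		if (!rxrpc_queue_work(&call->processor))
			rxrpc_put_call(call, rxrpc_call_put_noqueue);
		return true;
	}

	/* The processor owns the queue's ref: requeueing passes it straight
	 * back, otherwise it is dropped on exit.
	 */
	void rxrpc_process_call(struct work_struct *work)
	{
		struct rxrpc_call *call =
			container_of(work, struct rxrpc_call, processor);

		/* ... handle events ... */

		if (call->events && call->state < RXRPC_CALL_COMPLETE)
			__rxrpc_queue_call(call);
		else
			rxrpc_put_call(call, rxrpc_call_put);
	}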
Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 4 +- net/rxrpc/ar-internal.h | 15 ++-- net/rxrpc/call_accept.c | 55 ++++-------- net/rxrpc/call_event.c | 74 +++++++--------- net/rxrpc/call_object.c | 224 +++++++++++++++++++----------------------------- net/rxrpc/input.c | 26 +++--- net/rxrpc/output.c | 145 +++++++++++++++++++++++++++++++ net/rxrpc/proc.c | 4 +- net/rxrpc/recvmsg.c | 24 +----- net/rxrpc/skbuff.c | 3 - net/rxrpc/sysctl.c | 8 -- 11 files changed, 303 insertions(+), 279 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8356cd003d51..77a132abf140 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -294,8 +294,7 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); - rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call); - rxrpc_purge_queue(&call->knlrecv_queue); + rxrpc_release_call(rxrpc_sk(sock->sk), call); rxrpc_put_call(call, rxrpc_call_put); } EXPORT_SYMBOL(rxrpc_kernel_end_call); @@ -558,6 +557,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return -ENOMEM; sock_init_data(sock, sk); + sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; sk->sk_max_ack_backlog = 0; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e3dfc9da05fe..3addda4bfa6b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -35,8 +35,6 @@ struct rxrpc_crypt { #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) -#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) - struct rxrpc_connection; /* @@ -397,7 +395,6 @@ enum rxrpc_call_event { RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" 
message to the app */ - RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */ }; /* @@ -417,7 +414,6 @@ enum rxrpc_call_state { RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ RXRPC_CALL_COMPLETE, /* - call complete */ - RXRPC_CALL_DEAD, /* - call is dead */ NR__RXRPC_CALL_STATES }; @@ -442,12 +438,10 @@ struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ - struct rxrpc_sock *socket; /* socket responsible */ + struct rxrpc_sock __rcu *socket; /* socket responsible */ struct timer_list lifetimer; /* lifetime remaining on call */ - struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ struct timer_list ack_timer; /* ACK generation timer */ struct timer_list resend_timer; /* Tx resend timer */ - struct work_struct destroyer; /* call destroyer */ struct work_struct processor; /* packet processor and ACK generator */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ @@ -558,7 +552,6 @@ void rxrpc_process_call(struct work_struct *); extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern unsigned int rxrpc_max_call_lifetime; -extern unsigned int rxrpc_dead_call_expiry; extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; @@ -571,8 +564,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, struct sk_buff *); -void rxrpc_release_call(struct rxrpc_call *); +void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); +bool __rxrpc_queue_call(struct rxrpc_call *); +bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); @@ -835,6 +830,7 @@ extern const char *rxrpc_acks(u8 reason); /* * output.c */ +int rxrpc_send_call_packet(struct rxrpc_call *, u8); int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); /* @@ -880,7 +876,6 @@ extern const struct file_operations rxrpc_connection_seq_fops; /* * recvmsg.c */ -void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 487ae7aa86db..879a964de80c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -163,13 +163,7 @@ invalid_service: _debug("invalid"); read_unlock_bh(&local->services_lock); - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call, rxrpc_call_got); - rxrpc_queue_call(call); - } - read_unlock_bh(&call->state_lock); + rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); ret = -ECONNREFUSED; error: @@ -236,13 +230,11 @@ found_service: if (sk_acceptq_is_full(&rx->sk)) goto backlog_full; sk_acceptq_added(&rx->sk); - sock_hold(&rx->sk); read_unlock_bh(&local->services_lock); ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); if (ret < 0) sk_acceptq_removed(&rx->sk); - sock_put(&rx->sk); switch (ret) { case -ECONNRESET: /* old 
calls are ignored */ case -ECONNABORTED: /* aborted calls are reaborted or ignored */ @@ -333,9 +325,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, case RXRPC_CALL_COMPLETE: ret = call->error; goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; - goto out_discard; default: BUG(); } @@ -350,24 +339,20 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, BUG(); if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) BUG(); - rxrpc_queue_call(call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); + rxrpc_queue_call(call); _leave(" = %p{%d}", call, call->debug_id); return call; - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ out_release: - _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); -out_discard: write_unlock_bh(&call->state_lock); - _debug("discard %p", call); + write_unlock(&rx->call_lock); + _debug("release %p", call); + rxrpc_release_call(rx, call); + _leave(" = %d", ret); + return ERR_PTR(ret); out: write_unlock(&rx->call_lock); _leave(" = %d", ret); @@ -390,8 +375,11 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock(&rx->call_lock); ret = -ENODATA; - if (list_empty(&rx->acceptq)) - goto out; + if (list_empty(&rx->acceptq)) { + write_unlock(&rx->call_lock); + _leave(" = -ENODATA"); + return -ENODATA; + } /* dequeue the first call and check it's still valid */ call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); @@ -407,30 +395,17 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) rxrpc_queue_call(call); ret = 0; - goto out_release; + break; case RXRPC_CALL_COMPLETE: ret = call->error; - goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; - goto out_discard; + break; default: BUG(); } - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ -out_release: - _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); -out_discard: write_unlock_bh(&call->state_lock); - _debug("discard %p", call); -out: write_unlock(&rx->call_lock); + rxrpc_release_call(rx, call); _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fee8b6ddb334..8365d3366114 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -811,8 +811,9 @@ static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, } /* - * handle background processing of incoming call packets and ACK / abort - * generation + * Handle background processing of incoming call packets and ACK / abort + * generation. A ref on the call is donated to us by whoever queued the work + * item. 
*/ void rxrpc_process_call(struct work_struct *work) { @@ -827,6 +828,7 @@ void rxrpc_process_call(struct work_struct *work) unsigned long bits; __be32 data, pad; size_t len; + bool requeue = false; int loop, nbit, ioc, ret, mtu; u32 serial, abort_code = RX_PROTOCOL_ERROR; u8 *acks = NULL; @@ -838,6 +840,11 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events, (jiffies - call->creation_jif) / (HZ / 10)); + if (call->state >= RXRPC_CALL_COMPLETE) { + rxrpc_put_call(call, rxrpc_call_put); + return; + } + if (!call->conn) goto skip_msg_init; @@ -1088,16 +1095,21 @@ skip_msg_init: spin_lock_bh(&call->lock); if (call->state == RXRPC_CALL_SERVER_SECURING) { + struct rxrpc_sock *rx; _debug("securing"); - write_lock(&call->socket->call_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &call->socket->acceptq); + rcu_read_lock(); + rx = rcu_dereference(call->socket); + if (rx) { + write_lock(&rx->call_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + _debug("not released"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_move_tail(&call->accept_link, + &rx->acceptq); + } + write_unlock(&rx->call_lock); } - write_unlock(&call->socket->call_lock); + rcu_read_unlock(); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); @@ -1139,11 +1151,6 @@ skip_msg_init: goto maybe_reschedule; } - if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_release_call(call); - clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); - } - /* other events may have been raised since we started checking */ goto maybe_reschedule; @@ -1209,10 +1216,8 @@ send_message_2: &msg, iov, ioc, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + requeue = true; goto error; } @@ -1245,41 +1250,22 @@ send_message_2: kill_ACKs: del_timer_sync(&call->ack_timer); - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call, rxrpc_call_put); clear_bit(RXRPC_CALL_EV_ACK, &call->events); maybe_reschedule: if (call->events || !skb_queue_empty(&call->rx_queue)) { - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - } - - /* don't leave aborted connections on the accept queue */ - if (call->state >= RXRPC_CALL_COMPLETE && - !list_empty(&call->accept_link)) { - _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, call->conn->proto.cid); - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + requeue = true; } error: kfree(acks); - /* because we don't want two CPUs both processing the work item for one - * call at the same time, we use a flag to note when it's busy; however - * this means there's a race between clearing the flag and setting the - * work pending bit and the work item being processed again */ - if (call->events && !work_pending(&call->processor)) { + if ((requeue || call->events) && 
!work_pending(&call->processor)) { _debug("jumpstart %x", call->conn->proto.cid); - rxrpc_queue_call(call); + __rxrpc_queue_call(call); + } else { + rxrpc_put_call(call, rxrpc_call_put); } _leave(""); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 83019e489555..be5733d55794 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -24,11 +24,6 @@ */ unsigned int rxrpc_max_call_lifetime = 60 * HZ; -/* - * Time till dead call expires after last use (in jiffies). - */ -unsigned int rxrpc_dead_call_expiry = 2 * HZ; - const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", @@ -43,7 +38,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", [RXRPC_CALL_COMPLETE] = "Complete", - [RXRPC_CALL_DEAD] = "Dead ", }; const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { @@ -74,11 +68,10 @@ struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static void rxrpc_destroy_call(struct work_struct *work); static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_dead_call_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); +static void rxrpc_cleanup_call(struct rxrpc_call *call); /* * find an extant server call @@ -138,13 +131,10 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) setup_timer(&call->lifetimer, &rxrpc_call_life_expired, (unsigned long) call); - setup_timer(&call->deadspan, &rxrpc_dead_call_expired, - (unsigned long) call); setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, (unsigned long) call); setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, (unsigned long) call); - INIT_WORK(&call->destroyer, &rxrpc_destroy_call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); @@ -185,11 +175,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - - sock_hold(&rx->sk); - call->socket = rx; call->rx_data_post = 1; call->service_id = srx->srx_service; + rcu_assign_pointer(call->socket, rx); _leave(" = %p", call); return call; @@ -244,8 +232,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, 0, atomic_read(&call->usage), 0, here, - (const void *)user_call_ID); + trace_rxrpc_call(call, rxrpc_call_new_client, + atomic_read(&call->usage), 0, + here, (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; @@ -295,8 +284,10 @@ error: list_del_init(&call->link); write_unlock_bh(&rxrpc_call_lock); +error_out: + __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); set_bit(RXRPC_CALL_RELEASED, &call->flags); - call->state = RXRPC_CALL_DEAD; rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); @@ -308,11 +299,8 @@ error: */ found_user_ID_now_present: write_unlock(&rx->call_lock); - set_bit(RXRPC_CALL_RELEASED, &call->flags); - call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = -EEXIST [%p]", call); - return ERR_PTR(-EEXIST); + ret = -EEXIST; + goto error_out; } /* @@ -340,7 +328,6 @@ struct rxrpc_call 
*rxrpc_incoming_call(struct rxrpc_sock *rx, atomic_read(&candidate->usage), 0, here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; - candidate->socket = rx; candidate->conn = conn; candidate->peer = conn->params.peer; candidate->cid = sp->hdr.cid; @@ -351,6 +338,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); if (conn->security_ix > 0) candidate->state = RXRPC_CALL_SERVER_SECURING; + rcu_assign_pointer(candidate->socket, rx); spin_lock(&conn->channel_lock); @@ -411,7 +399,6 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate = NULL; conn->channels[chan].call_counter = call_id; rcu_assign_pointer(conn->channels[chan].call, call); - sock_hold(&rx->sk); rxrpc_get_connection(conn); rxrpc_get_peer(call->peer); spin_unlock(&conn->channel_lock); @@ -452,6 +439,39 @@ old_call: return ERR_PTR(-ECONNRESET); } +/* + * Queue a call's work processor, getting a ref to pass to the work queue. + */ +bool rxrpc_queue_call(struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&call->usage, 1, 0); + int m = atomic_read(&call->skb_count); + if (n == 0) + return false; + if (rxrpc_queue_work(&call->processor)) + trace_rxrpc_call(call, rxrpc_call_queued, n + 1, m, here, NULL); + else + rxrpc_put_call(call, rxrpc_call_put_noqueue); + return true; +} + +/* + * Queue a call's work processor, passing the callers ref to the work queue. + */ +bool __rxrpc_queue_call(struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + int n = atomic_read(&call->usage); + int m = atomic_read(&call->skb_count); + ASSERTCMP(n, >=, 1); + if (rxrpc_queue_work(&call->processor)) + trace_rxrpc_call(call, rxrpc_call_queued_ref, n, m, here, NULL); + else + rxrpc_put_call(call, rxrpc_call_put_noqueue); + return true; +} + /* * Note the re-emergence of a call. 
*/ @@ -493,11 +513,8 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) /* * detach a call from a socket and set up for release */ -void rxrpc_release_call(struct rxrpc_call *call) +void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { - struct rxrpc_connection *conn = call->conn; - struct rxrpc_sock *rx = call->socket; - _enter("{%d,%d,%d,%d}", call->debug_id, atomic_read(&call->usage), atomic_read(&call->ackr_not_idle), @@ -513,7 +530,7 @@ void rxrpc_release_call(struct rxrpc_call *call) /* dissociate from the socket * - the socket's ref on the call is passed to the death timer */ - _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + _debug("RELEASE CALL %p (%d)", call, call->debug_id); if (call->peer) { spin_lock(&call->peer->lock); @@ -532,20 +549,30 @@ void rxrpc_release_call(struct rxrpc_call *call) rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + rxrpc_put_call(call, rxrpc_call_put_userid); } write_unlock_bh(&rx->call_lock); /* free up the channel for reuse */ - write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) { + clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_call_completed(call); + } else { + write_lock_bh(&call->state_lock); + + if (call->state < RXRPC_CALL_COMPLETE) { + _debug("+++ ABORTING STATE %d +++\n", call->state); + __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); + clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + } - if (call->state < RXRPC_CALL_COMPLETE && - call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); + write_unlock_bh(&call->state_lock); } - write_unlock_bh(&call->state_lock); - rxrpc_disconnect_call(call); + if (call->conn) + rxrpc_disconnect_call(call); /* clean up the Rx queue */ if (!skb_queue_empty(&call->rx_queue) || @@ -569,52 +596,15 @@ void rxrpc_release_call(struct rxrpc_call *call) } spin_unlock_bh(&call->lock); } + rxrpc_purge_queue(&call->knlrecv_queue); del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); - call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; - add_timer(&call->deadspan); _leave(""); } -/* - * handle a dead call being ready for reaping - */ -static void rxrpc_dead_call_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_DEAD; - write_unlock_bh(&call->state_lock); - rxrpc_put_call(call, rxrpc_call_put); -} - -/* - * mark a call as to be released, aborting it if it's still in progress - * - called with softirqs disabled - */ -static void rxrpc_mark_call_released(struct rxrpc_call *call) -{ - bool sched = false; - - rxrpc_see_call(call); - write_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { - sched = __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); - if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - sched = true; - } - write_unlock(&call->state_lock); - if (sched) - rxrpc_queue_call(call); -} - /* * release all the calls associated with a socket */ @@ -629,17 +619,17 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* kill the not-yet-accepted 
incoming calls */ list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_mark_call_released(call); + rxrpc_release_call(rx, call); } list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_mark_call_released(call); + rxrpc_release_call(rx, call); } /* mark all the calls as no longer wanting incoming packets */ for (p = rb_first(&rx->calls); p; p = rb_next(p)) { call = rb_entry(p, struct rxrpc_call, sock_node); - rxrpc_mark_call_released(call); + rxrpc_release_call(rx, call); } read_unlock_bh(&rx->call_lock); @@ -663,8 +653,7 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) if (n == 0) { _debug("call %d dead", call->debug_id); WARN_ON(m != 0); - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - rxrpc_queue_work(&call->destroyer); + rxrpc_cleanup_call(call); } } @@ -683,8 +672,7 @@ void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) if (n == 0) { _debug("call %d dead", call->debug_id); WARN_ON(m != 0); - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - rxrpc_queue_work(&call->destroyer); + rxrpc_cleanup_call(call); } } @@ -708,23 +696,19 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) { _net("DESTROY CALL %d", call->debug_id); - ASSERT(call->socket); + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); del_timer_sync(&call->lifetimer); - del_timer_sync(&call->deadspan); del_timer_sync(&call->ack_timer); del_timer_sync(&call->resend_timer); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERTCMP(call->events, ==, 0); - if (work_pending(&call->processor)) { - _debug("defer destroy"); - rxrpc_queue_work(&call->destroyer); - return; - } - + ASSERT(!work_pending(&call->processor)); ASSERTCMP(call->conn, ==, NULL); if (call->acks_window) { @@ -753,40 +737,21 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_purge_queue(&call->rx_queue); ASSERT(skb_queue_empty(&call->rx_oos_queue)); rxrpc_purge_queue(&call->knlrecv_queue); - sock_put(&call->socket->sk); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } /* - * destroy a call - */ -static void rxrpc_destroy_call(struct work_struct *work) -{ - struct rxrpc_call *call = - container_of(work, struct rxrpc_call, destroyer); - - _enter("%p{%d,%x,%p}", - call, atomic_read(&call->usage), call->cid, call->conn); - - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); - - rxrpc_cleanup_call(call); - _leave(""); -} - -/* - * preemptively destroy all the call records from a transport endpoint rather - * than waiting for them to time out + * Make sure that all calls are gone. 
*/ void __exit rxrpc_destroy_all_calls(void) { struct rxrpc_call *call; _enter(""); + + if (list_empty(&rxrpc_calls)) + return; + write_lock_bh(&rxrpc_call_lock); while (!list_empty(&rxrpc_calls)) { @@ -796,28 +761,15 @@ void __exit rxrpc_destroy_all_calls(void) rxrpc_see_call(call); list_del_init(&call->link); - switch (atomic_read(&call->usage)) { - case 0: - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - break; - case 1: - if (del_timer_sync(&call->deadspan) != 0 && - call->state != RXRPC_CALL_DEAD) - rxrpc_dead_call_expired((unsigned long) call); - if (call->state != RXRPC_CALL_DEAD) - break; - default: - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", - call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - rxrpc_call_states[call->state], - call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - break; - } + pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + rxrpc_call_states[call->state], + call->flags, call->events); + if (!skb_queue_empty(&call->rx_queue)) + pr_err("Rx queue occupied\n"); + if (!skb_queue_empty(&call->rx_oos_queue)) + pr_err("OOS queue occupied\n"); write_unlock_bh(&rxrpc_call_lock); cond_resched(); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8267f42a7753..79f3f585cdc3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -39,7 +39,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, bool force, bool terminal) { struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx = call->socket; + struct rxrpc_sock *rx; struct sock *sk; int ret; @@ -59,7 +59,15 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, return 0; } + /* The socket may go away under us */ + ret = 0; + rcu_read_lock(); + rx = rcu_dereference(call->socket); + if (!rx) + goto out; sk = &rx->sk; + if (sock_flag(sk, SOCK_DEAD)) + goto out; if (!force) { /* cast skb->rcvbuf to unsigned... 
It's pointless, but @@ -78,7 +86,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, spin_lock_bh(&sk->sk_receive_queue.lock); if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - call->socket->sk.sk_state != RXRPC_CLOSE) { + sk->sk_state != RXRPC_CLOSE) { skb->destructor = rxrpc_packet_destructor; skb->dev = NULL; skb->sk = sk; @@ -104,8 +112,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock_bh(&sk->sk_receive_queue.lock); - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + sk->sk_data_ready(sk); } skb = NULL; } else { @@ -115,6 +122,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, out: rxrpc_free_skb(skb); + rcu_read_unlock(); _leave(" = %d", ret); return ret; @@ -266,7 +274,7 @@ enqueue_packet: skb_queue_tail(&call->rx_queue, skb); atomic_inc(&call->ackr_not_idle); read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) + if (call->state < RXRPC_CALL_COMPLETE) rxrpc_queue_call(call); read_unlock(&call->state_lock); _leave(" = 0 [queued]"); @@ -408,7 +416,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_PACKET_TYPE_ACK: /* ACK processing is done in process context */ read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { + if (call->state < RXRPC_CALL_COMPLETE) { skb_queue_tail(&call->rx_queue, skb); rxrpc_queue_call(call); skb = NULL; @@ -511,9 +519,6 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, read_lock(&call->state_lock); switch (call->state) { - case RXRPC_CALL_DEAD: - goto dead_call; - case RXRPC_CALL_COMPLETE: switch (call->completion) { case RXRPC_CALL_LOCALLY_ABORTED: @@ -538,7 +543,6 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, } read_unlock(&call->state_lock); - rxrpc_get_call(call, rxrpc_call_got); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -546,12 +550,10 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, else rxrpc_fast_process_packet(call, skb); - rxrpc_put_call(call, rxrpc_call_put); goto done; resend_final_ack: _debug("final ack again"); - rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_queue_call(call); goto free_unlock; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5b5508f6fc2a..8756d74fd74b 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -19,6 +19,151 @@ #include #include "ar-internal.h" +struct rxrpc_pkt_buffer { + struct rxrpc_wire_header whdr; + union { + struct { + struct rxrpc_ackpacket ack; + u8 acks[255]; + u8 pad[3]; + }; + __be32 abort_code; + }; + struct rxrpc_ackinfo ackinfo; +}; + +/* + * Fill out an ACK packet. 
+ */ +static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, + struct rxrpc_pkt_buffer *pkt) +{ + u32 mtu, jmax; + u8 *ackp = pkt->acks; + + pkt->ack.bufferSpace = htons(8); + pkt->ack.maxSkew = htons(0); + pkt->ack.firstPacket = htonl(call->rx_data_eaten + 1); + pkt->ack.previousPacket = htonl(call->ackr_prev_seq); + pkt->ack.serial = htonl(call->ackr_serial); + pkt->ack.reason = RXRPC_ACK_IDLE; + pkt->ack.nAcks = 0; + + mtu = call->peer->if_mtu; + mtu -= call->peer->hdrsize; + jmax = rxrpc_rx_jumbo_max; + pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + pkt->ackinfo.maxMTU = htonl(mtu); + pkt->ackinfo.rwind = htonl(rxrpc_rx_window_size); + pkt->ackinfo.jumbo_max = htonl(jmax); + + *ackp++ = 0; + *ackp++ = 0; + *ackp++ = 0; + return 3; +} + +/* + * Send a final ACK or ABORT call packet. + */ +int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) +{ + struct rxrpc_connection *conn = NULL; + struct rxrpc_pkt_buffer *pkt; + struct msghdr msg; + struct kvec iov[2]; + rxrpc_serial_t serial; + size_t len, n; + int ioc, ret; + u32 abort_code; + + _enter("%u,%s", call->debug_id, rxrpc_pkts[type]); + + spin_lock_bh(&call->lock); + if (call->conn) + conn = rxrpc_get_connection_maybe(call->conn); + spin_unlock_bh(&call->lock); + if (!conn) + return -ECONNRESET; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) { + rxrpc_put_connection(conn); + return -ENOMEM; + } + + serial = atomic_inc_return(&conn->serial); + + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + pkt->whdr.epoch = htonl(conn->proto.epoch); + pkt->whdr.cid = htonl(call->cid); + pkt->whdr.callNumber = htonl(call->call_id); + pkt->whdr.seq = 0; + pkt->whdr.serial = htonl(serial); + pkt->whdr.type = type; + pkt->whdr.flags = conn->out_clientflag; + pkt->whdr.userStatus = 0; + pkt->whdr.securityIndex = call->security_ix; + pkt->whdr._rsvd = 0; + pkt->whdr.serviceId = htons(call->service_id); + + iov[0].iov_base = pkt; + iov[0].iov_len = sizeof(pkt->whdr); + len = sizeof(pkt->whdr); + + switch (type) { + case RXRPC_PACKET_TYPE_ACK: + spin_lock_bh(&call->lock); + n = rxrpc_fill_out_ack(call, pkt); + call->ackr_reason = 0; + + spin_unlock_bh(&call->lock); + + _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + serial, + ntohs(pkt->ack.maxSkew), + ntohl(pkt->ack.firstPacket), + ntohl(pkt->ack.previousPacket), + ntohl(pkt->ack.serial), + rxrpc_acks(pkt->ack.reason), + pkt->ack.nAcks); + + iov[0].iov_len += sizeof(pkt->ack) + n; + iov[1].iov_base = &pkt->ackinfo; + iov[1].iov_len = sizeof(pkt->ackinfo); + len += sizeof(pkt->ack) + n + sizeof(pkt->ackinfo); + ioc = 2; + break; + + case RXRPC_PACKET_TYPE_ABORT: + abort_code = call->abort_code; + pkt->abort_code = htonl(abort_code); + _proto("Tx ABORT %%%u { %d }", serial, abort_code); + iov[0].iov_len += sizeof(pkt->abort_code); + len += sizeof(pkt->abort_code); + ioc = 1; + break; + + default: + BUG(); + ret = -ENOANO; + goto out; + } + + ret = kernel_sendmsg(conn->params.local->socket, + &msg, iov, ioc, len); + +out: + rxrpc_put_connection(conn); + kfree(pkt); + return ret; +} + /* * send a packet through the transport endpoint */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 82c64055449d..dfad23821a62 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -29,6 +29,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) { + rcu_read_lock(); 
read_lock(&rxrpc_call_lock); return seq_list_start_head(&rxrpc_calls, *_pos); } @@ -41,6 +42,7 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) { read_unlock(&rxrpc_call_lock); + rcu_read_unlock(); } static int rxrpc_call_seq_show(struct seq_file *seq, void *v) @@ -61,7 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call = list_entry(v, struct rxrpc_call, link); - rx = READ_ONCE(call->socket); + rx = rcu_dereference(call->socket); if (rx) { local = READ_ONCE(rx->local); if (local) diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 97f8ee76c67c..6876ffb3b410 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -18,28 +18,6 @@ #include #include "ar-internal.h" -/* - * removal a call's user ID from the socket tree to make the user ID available - * again and so that it won't be seen again in association with that call - */ -void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - _debug("RELEASE CALL %d", call->debug_id); - - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - write_lock_bh(&rx->call_lock); - rb_erase(&call->sock_node, &call->socket->calls); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); - write_unlock_bh(&rx->call_lock); - } - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); -} - /* * receive a message from an RxRPC socket * - we need to be careful about two or more threads calling recvmsg @@ -338,7 +316,7 @@ terminal_message: if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) BUG(); rxrpc_free_skb(skb); - rxrpc_remove_user_ID(rx, call); + rxrpc_release_call(rx, call); } release_sock(&rx->sk); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index c0613ab6d2d5..9b8f8456d3bf 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -33,9 +33,6 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call) call->state = RXRPC_CALL_CLIENT_FINAL_ACK; _debug("request final ACK"); - /* get an extra ref on the call for the final-ACK generator to - * release */ - rxrpc_get_call(call, rxrpc_call_got); set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); if (try_to_del_timer_sync(&call->ack_timer) >= 0) rxrpc_queue_call(call); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index dc380af8a81e..b7ca8cf13c84 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -88,14 +88,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, .extra1 = (void *)&one, }, - { - .procname = "dead_call_expiry", - .data = &rxrpc_dead_call_expiry, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - .extra1 = (void *)&one, - }, /* Non-time values */ { -- cgit v1.2.3 From e8d6bbb05aa5cb985c3661d0db4f858f1d251326 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2016 16:34:12 +0100 Subject: rxrpc: Fix returns of call completion helpers rxrpc_set_call_completion() returns bool, not int, so the ret variable should match this. rxrpc_call_completed() and __rxrpc_call_completed() should return the value of rxrpc_set_call_completion(). 
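As an illustration (a hypothetical caller, not taken from the tree), the propagated bool lets only the thread that actually performed the transition to RXRPC_CALL_COMPLETE do the follow-up work:

	static void example_on_final_ack(struct rxrpc_call *call)
	{
		/* Only the caller that actually moved the call to
		 * RXRPC_CALL_COMPLETE kicks the processor; a concurrent
		 * caller that lost the race sees false and does nothing.
		 */
		if (rxrpc_call_completed(call))
			rxrpc_queue_call(call);
	}
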
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3addda4bfa6b..0353399792b6 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -608,7 +608,7 @@ static inline bool rxrpc_set_call_completion(struct rxrpc_call *call, u32 abort_code, int error) { - int ret; + bool ret; write_lock_bh(&call->state_lock); ret = __rxrpc_set_call_completion(call, compl, abort_code, error); @@ -619,16 +619,19 @@ static inline bool rxrpc_set_call_completion(struct rxrpc_call *call, /* * Record that a call successfully completed. */ -static inline void __rxrpc_call_completed(struct rxrpc_call *call) +static inline bool __rxrpc_call_completed(struct rxrpc_call *call) { - __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); + return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); } -static inline void rxrpc_call_completed(struct rxrpc_call *call) +static inline bool rxrpc_call_completed(struct rxrpc_call *call) { + bool ret; + write_lock_bh(&call->state_lock); - __rxrpc_call_completed(call); + ret = __rxrpc_call_completed(call); write_unlock_bh(&call->state_lock); + return ret; } /* -- cgit v1.2.3 From 5a42976d4fe5d7fddce133de995c742c87b1b7e3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Sep 2016 22:19:51 +0100 Subject: rxrpc: Add tracepoint for working out where aborts happen Add a tracepoint for working out where local aborts happen. Each tracepoint call is labelled with a 3-letter code so that they can be distinguished - and the DATA sequence number is added too where available. rxrpc_kernel_abort_call() also takes a 3-letter code so that AFS can indicate the circumstances when it aborts a call. 
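As an illustration only (hypothetical kernel-service code, not part of this patch), the extended signature lets a service tag each abort site with its own 3-letter code so the sites can be told apart in the rxrpc_abort trace output:

	static void example_abort_bad_reply(struct socket *sock,
					    struct rxrpc_call *rxcall)
	{
		/* "EX1" is an arbitrary example marker; it appears in the
		 * rxrpc_abort tracepoint alongside the abort code and error.
		 */
		rxrpc_kernel_abort_call(sock, rxcall, RX_PROTOCOL_ERROR,
					EBADMSG, "EX1");
	}
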
Signed-off-by: David Howells --- fs/afs/rxrpc.c | 17 ++++--- include/net/af_rxrpc.h | 3 +- include/trace/events/rxrpc.h | 29 ++++++++++++ net/rxrpc/ar-internal.h | 14 ++++-- net/rxrpc/call_event.c | 7 +-- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 6 +++ net/rxrpc/input.c | 7 +-- net/rxrpc/insecure.c | 19 ++++---- net/rxrpc/rxkad.c | 108 +++++++++++++++++++------------------------ net/rxrpc/sendmsg.c | 18 ++++---- 11 files changed, 132 insertions(+), 98 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 37608be52abd..53750dece80e 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -377,7 +377,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -425,12 +425,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KNC"); goto do_abort; case -ENOTSUPP: abort_code = RX_INVALID_OPERATION; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, -ret, "KIV"); goto do_abort; case -ENODATA: case -EBADMSG: @@ -440,7 +440,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code); + abort_code, EBADMSG, "KUM"); goto do_abort; } } @@ -463,6 +463,7 @@ do_abort: */ static int afs_wait_for_call_to_complete(struct afs_call *call) { + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -481,9 +482,11 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) continue; } + abort_why = "KWC"; ret = call->error; if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -497,7 +500,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_CALL_DEAD); + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -695,7 +698,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -734,7 +737,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT); + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b4b6a3664dda..08ed8729126c 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -35,7 +35,8 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); -void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32); +void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, + u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, 
rxrpc_notify_rx_t); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 30164896f1f6..85ee035774ae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -84,6 +84,35 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_abort, + TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, + int abort_code, int error), + + TP_ARGS(why, cid, call_id, seq, abort_code, error), + + TP_STRUCT__entry( + __array(char, why, 4 ) + __field(u32, cid ) + __field(u32, call_id ) + __field(rxrpc_seq_t, seq ) + __field(int, abort_code ) + __field(int, error ) + ), + + TP_fast_assign( + memcpy(__entry->why, why, 4); + __entry->cid = cid; + __entry->call_id = call_id; + __entry->abort_code = abort_code; + __entry->error = error; + __entry->seq = seq; + ), + + TP_printk("%08x:%08x s=%u a=%d e=%d %s", + __entry->cid, __entry->call_id, __entry->seq, + __entry->abort_code, __entry->error, __entry->why) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0353399792b6..dbfb9ed17483 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -155,7 +155,8 @@ struct rxrpc_security { void *); /* verify the security on a received packet */ - int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, + rxrpc_seq_t, u16); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -637,9 +638,12 @@ static inline bool rxrpc_call_completed(struct rxrpc_call *call) /* * Record that a call is locally aborted. */ -static inline bool __rxrpc_abort_call(struct rxrpc_call *call, +static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { + trace_rxrpc_abort(why, call->cid, call->call_id, seq, + abort_code, error); if (__rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error)) { @@ -649,13 +653,13 @@ static inline bool __rxrpc_abort_call(struct rxrpc_call *call, return false; } -static inline bool rxrpc_abort_call(struct rxrpc_call *call, - u32 abort_code, int error) +static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) { bool ret; write_lock_bh(&call->state_lock); - ret = __rxrpc_abort_call(call, abort_code, error); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); write_unlock_bh(&call->state_lock); return ret; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8365d3366114..af88ad7d2cf9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -598,7 +598,8 @@ process_further: /* secured packets must be verified and possibly decrypted */ if (call->conn->security->verify_packet(call, skb, - _abort_code) < 0) + sp->hdr.seq, + sp->hdr.cksum) < 0) goto protocol_error; rxrpc_insert_oos_packet(call, skb); @@ -982,7 +983,7 @@ skip_msg_init: } if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { - rxrpc_abort_call(call, RX_CALL_TIMEOUT, ETIME); + rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); _debug("post timeout"); if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, @@ -1005,7 +1006,7 @@ skip_msg_init: case -EKEYEXPIRED: case -EKEYREJECTED: case -EPROTO: - rxrpc_abort_call(call, abort_code, -ret); + rxrpc_abort_call("PRO", call, 0, abort_code, -ret); goto kill_ACKs; } } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 
be5733d55794..9efd9b0b0bdf 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -563,7 +563,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) if (call->state < RXRPC_CALL_COMPLETE) { _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET); + __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9db90f4f768d..8c7938ba6a84 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -158,6 +158,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, lockdep_is_held(&conn->channel_lock)); if (call) { rxrpc_see_call(call); + if (compl == RXRPC_CALL_LOCALLY_ABORTED) + trace_rxrpc_abort("CON", call->cid, + call->call_id, 0, + abort_code, error); + write_lock_bh(&call->state_lock); if (rxrpc_set_call_completion(call, compl, abort_code, error)) { @@ -167,6 +172,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, write_unlock_bh(&call->state_lock); if (queue) rxrpc_queue_call(call); + } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 79f3f585cdc3..8e624109750a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -429,7 +429,7 @@ protocol_error: _debug("protocol error"); write_lock_bh(&call->state_lock); protocol_error_locked: - if (__rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); free_packet_unlock: write_unlock_bh(&call->state_lock); @@ -495,9 +495,10 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, protocol_error: _debug("protocol error"); rxrpc_free_skb(part); - rxrpc_free_skb(jumbo); - if (rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO)) + if (rxrpc_abort_call("PJP", call, sp->hdr.seq, + RX_PROTOCOL_ERROR, EPROTO)) rxrpc_queue_call(call); + rxrpc_free_skb(jumbo); _leave(""); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index c21ad213b337..a4aba0246731 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -23,31 +23,32 @@ static int none_prime_packet_security(struct rxrpc_connection *conn) } static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { return 0; } static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + rxrpc_seq_t seq, + u16 expected_cksum) { return 0; } static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 89f475febfd7..3777432df10b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -316,12 +316,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* * decrypt partial encryption on a packet (level 1 security) */ -static int rxkad_verify_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { 
struct rxkad_level1_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; @@ -332,7 +330,10 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, _enter(""); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -351,9 +352,11 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, crypto_skcipher_decrypt(req); skcipher_request_zero(req); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -361,24 +364,24 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -391,13 +394,11 @@ nomem: /* * wholly decrypt a packet (level 2 security) */ -static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; @@ -408,7 +409,10 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, _enter(",{%d}", skb->len); - sp = rxrpc_skb(skb); + if (skb->len < 8) { + rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } /* we want to decrypt the skbuff in-place */ nsg = skb_cow_data(skb, 0, &trailer); @@ -437,9 +441,11 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, if (sg != _sg) kfree(sg); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } if (!skb_pull(skb, sizeof(sechdr))) BUG(); @@ -447,24 +453,23 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > 
skb->len) - goto datalen_error; - else if (data_size < skb->len) + if (data_size > skb->len) { + rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + if (data_size < skb->len) skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: _leave(" = -EPROTO"); return -EPROTO; @@ -475,40 +480,30 @@ nomem: } /* - * verify the security on a received packet + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). */ -static int rxkad_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); - struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg; u16 cksum; u32 x, y; - int ret; - - sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), seq); if (!call->conn->cipher) return 0; - if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) { - *_abort_code = RXKADINCONSISTENCY; - _leave(" = -EPROTO [not rxkad]"); - return -EPROTO; - } - /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); - x |= sp->hdr.seq & 0x3fffffff; + x |= seq & 0x3fffffff; call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); @@ -524,29 +519,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ - if (sp->hdr.cksum != cksum) { - *_abort_code = RXKADSEALEDINCON; + if (cksum != expected_cksum) { + rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - ret = 0; - break; + return 0; case RXRPC_SECURITY_AUTH: - ret = rxkad_verify_packet_auth(call, skb, _abort_code); - break; + return rxkad_verify_packet_1(call, skb, seq); case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); - break; + return rxkad_verify_packet_2(call, skb, seq); default: - ret = -ENOANO; - break; + return -ENOANO; } - - _leave(" = %d", ret); - return ret; } /* diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2439aff131c7..9a4af992fcdf 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -454,14 +454,15 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * abort a call, sending an ABORT packet to the peer */ -static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, + u32 abort_code, int error) { if (call->state >= RXRPC_CALL_COMPLETE) return; write_lock_bh(&call->state_lock); - if (__rxrpc_abort_call(call, abort_code, ECONNABORTED)) { + if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); @@ -556,7 +557,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; 
} else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; @@ -626,20 +627,19 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: 3-char string indicating why. * * Allow a kernel service to abort a call, if it's still in an abortable state. */ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, - u32 abort_code) + u32 abort_code, int error, const char *why) { - _enter("{%d},%d", call->debug_id, abort_code); + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); lock_sock(sock->sk); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - rxrpc_send_abort(call, abort_code); + rxrpc_send_abort(call, why, abort_code, error); release_sock(sock->sk); _leave(""); -- cgit v1.2.3 From 733ade23de1b72c1f11c5e4a1a9020a6f48decd2 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 6 Sep 2016 21:31:17 -0700 Subject: netlink: don't forget to release a rhashtable_iter structure This bug was detected by kmemleak: unreferenced object 0xffff8804269cc3c0 (size 64): comm "criu", pid 1042, jiffies 4294907360 (age 13.713s) hex dump (first 32 bytes): a0 32 cc 2c 04 88 ff ff 00 00 00 00 00 00 00 00 .2.,............ 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x10f/0x280 [] __netlink_diag_dump+0x26c/0x290 [netlink_diag] v2: don't remove a reference on a rhashtable_iter structure to release it from netlink_diag_dump_done Cc: Herbert Xu Fixes: ad202074320c ("netlink: Use rhashtable walk interface in diag dump") Signed-off-by: Andrei Vagin Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/diag.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 3e3e2534478a..b2f0e986a6f4 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -127,7 +127,6 @@ stop: goto done; rhashtable_walk_exit(hti); - cb->args[2] = 0; num++; mc_list: -- cgit v1.2.3 From f95bf346226b9b79352e05508beececc807cc37a Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 7 Sep 2016 13:38:35 +0900 Subject: net: diag: make udp_diag_destroy work for mapped addresses. udp_diag_destroy does look up the IPv4 UDP hashtable for mapped addresses, but it gets the IPv4 address to look up from the beginning of the IPv6 address instead of the end. Tested: https://android-review.googlesource.com/269874 Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/udp_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8a9f6e535caa..58b79c0c0d69 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -186,8 +186,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, tbl, NULL); else -- cgit v1.2.3 From 0f76d2564469fd3a337de088f533364cef206130 Mon Sep 17 00:00:00 2001 From: "subashab@codeaurora.org" Date: Tue, 6 Sep 2016 18:09:31 -0600 Subject: net: xfrm: Change u32 sysctl entries to use proc_douintvec proc_dointvec limits the values to INT_MAX in u32 sysctl entries. proc_douintvec allows writing values up to UINT_MAX. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/xfrm/xfrm_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c index 05a6e3d9c258..35a7e794ad04 100644 --- a/net/xfrm/xfrm_sysctl.c +++ b/net/xfrm/xfrm_sysctl.c @@ -17,13 +17,13 @@ static struct ctl_table xfrm_table[] = { .procname = "xfrm_aevent_etime", .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_douintvec }, { .procname = "xfrm_aevent_rseqth", .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_douintvec }, { .procname = "xfrm_larval_drop", -- cgit v1.2.3 From cf13258fd4cb86478dfcb7e2c93a0d844307abc6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:11 +0100 Subject: rxrpc: Fix ASSERTCMP and ASSERTIFCMP to handle signed values Fix ASSERTCMP and ASSERTIFCMP to be able to handle signed values by casting both parameters to the type of the first before comparing. Without this, both values are cast to unsigned long, which means that checks for values less than zero don't work. The downside of this is that the state enum values in struct rxrpc_call and struct rxrpc_connection can't be bitfields as __typeof__ can't handle them.
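A minimal stand-alone sketch (user-space C, not kernel code) of the failure mode the new casts address:

	#include <assert.h>

	int main(void)
	{
		int err = -5;

		/* Old ASSERTCMP behaviour: both sides forced to unsigned long,
		 * so "err < 0" can never hold and the assertion always fires. */
		assert(!((unsigned long)err < (unsigned long)0));

		/* New behaviour: compare in the type of the first argument, so
		 * a negative error value is correctly seen as less than zero. */
		assert((__typeof__(err))err < (__typeof__(err))0);

		return 0;
	}
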
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dbfb9ed17483..6dc3a59ce0f2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -344,8 +344,8 @@ struct rxrpc_connection { unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ - enum rxrpc_conn_cache_state cache_state : 8; - enum rxrpc_conn_proto_state state : 8; /* current state of connection */ + enum rxrpc_conn_cache_state cache_state; + enum rxrpc_conn_proto_state state; /* current state of connection */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ int debug_id; /* debug ID for printks */ @@ -464,8 +464,8 @@ struct rxrpc_call { rwlock_t state_lock; /* lock for state transition */ u32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ - enum rxrpc_call_state state : 8; /* current state of call */ - enum rxrpc_call_completion completion : 8; /* Call completion condition */ + enum rxrpc_call_state state; /* current state of call */ + enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ @@ -1014,11 +1014,12 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ - unsigned long _x = (unsigned long)(X); \ - unsigned long _y = (unsigned long)(Y); \ + __typeof__(X) _x = (X); \ + __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely(!(_x OP _y))) { \ - pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ - _x, _x, #OP, _y, _y); \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + (unsigned long)_x, (unsigned long)_x, #OP, \ + (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) @@ -1033,11 +1034,12 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ - unsigned long _x = (unsigned long)(X); \ - unsigned long _y = (unsigned long)(Y); \ + __typeof__(X) _x = (X); \ + __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely((C) && !(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ - _x, _x, #OP, _y, _y); \ + (unsigned long)_x, (unsigned long)_x, #OP, \ + (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) -- cgit v1.2.3 From de8d6c7401ae8f25db3788804c86887ad7347bee Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:11 +0100 Subject: rxrpc: Convert rxrpc_local::services to an hlist Convert the rxrpc_local::services list to an hlist so that it can be accessed under RCU conditions more readily. 
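As a sketch only (this patch still takes services_lock for traversal; a later change would be needed before doing this for real), the sort of RCU-side lookup the hlist conversion is meant to permit:

	static struct rxrpc_sock *example_find_service_rcu(struct rxrpc_local *local,
							   u16 service_id)
	{
		struct rxrpc_sock *rx;

		rcu_read_lock();
		hlist_for_each_entry_rcu(rx, &local->services, listen_link) {
			if (rx->srx.srx_service == service_id)
				goto found;
		}
		rx = NULL;
	found:
		/* A real caller would take a reference on rx here, before
		 * dropping the RCU read lock. */
		rcu_read_unlock();
		return rx;
	}
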
Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 10 +++++----- net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/local_object.c | 4 ++-- net/rxrpc/security.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 77a132abf140..f13cca1e973e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -156,13 +156,13 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) if (rx->srx.srx_service) { write_lock_bh(&local->services_lock); - list_for_each_entry(prx, &local->services, listen_link) { + hlist_for_each_entry(prx, &local->services, listen_link) { if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; } rx->local = local; - list_add_tail(&rx->listen_link, &local->services); + hlist_add_head_rcu(&rx->listen_link, &local->services); write_unlock_bh(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; @@ -567,7 +567,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->family = protocol; rx->calls = RB_ROOT; - INIT_LIST_HEAD(&rx->listen_link); + INIT_HLIST_NODE(&rx->listen_link); INIT_LIST_HEAD(&rx->secureq); INIT_LIST_HEAD(&rx->acceptq); rwlock_init(&rx->call_lock); @@ -615,9 +615,9 @@ static int rxrpc_release_sock(struct sock *sk) ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); - if (!list_empty(&rx->listen_link)) { + if (!hlist_unhashed(&rx->listen_link)) { write_lock_bh(&rx->local->services_lock); - list_del(&rx->listen_link); + hlist_del_rcu(&rx->listen_link); write_unlock_bh(&rx->local->services_lock); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6dc3a59ce0f2..fd438dc93ee9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -71,7 +71,7 @@ struct rxrpc_sock { struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ struct rxrpc_local *local; /* local endpoint */ - struct list_head listen_link; /* link in the local endpoint's listen list */ + struct hlist_node listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ struct key *key; /* security for this socket */ @@ -186,7 +186,7 @@ struct rxrpc_local { struct list_head link; struct socket *socket; /* my UDP socket */ struct work_struct processor; - struct list_head services; /* services listening on this endpoint */ + struct hlist_head services; /* services listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 879a964de80c..4c71efcf82ed 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -217,7 +217,7 @@ void rxrpc_accept_incoming_calls(struct rxrpc_local *local) /* get the socket providing the service */ read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { + hlist_for_each_entry(rx, &local->services, listen_link) { if (rx->srx.srx_service == sp->hdr.serviceId && rx->sk.sk_state != RXRPC_CLOSE) goto found_service; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a753796fbe8f..610916f4ae34 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -75,7 +75,7 @@ static struct 
rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) atomic_set(&local->usage, 1); INIT_LIST_HEAD(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); - INIT_LIST_HEAD(&local->services); + INIT_HLIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); @@ -296,7 +296,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) mutex_unlock(&rxrpc_local_mutex); ASSERT(RB_EMPTY_ROOT(&local->client_conns)); - ASSERT(list_empty(&local->services)); + ASSERT(hlist_empty(&local->services)); if (socket) { local->socket = NULL; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 814d285ff802..5d79d5a9c944 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -131,7 +131,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* find the service */ read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { + hlist_for_each_entry(rx, &local->services, listen_link) { if (rx->srx.srx_service == conn->params.service_id) goto found_service; } -- cgit v1.2.3 From 2ab27215ea27475a0b279732ba8a934bfab57ef0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Remove skb_count from struct rxrpc_call Remove the sk_buff count from the rxrpc_call struct as it's less useful once we stop queueing sk_buffs. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 10 +++------- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_object.c | 34 ++++++++++++---------------------- 3 files changed, 15 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 85ee035774ae..6b06cf050bc0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -18,16 +18,14 @@ TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, - int usage, int nskb, - const void *where, const void *aux), + int usage, const void *where, const void *aux), - TP_ARGS(call, op, usage, nskb, where, aux), + TP_ARGS(call, op, usage, where, aux), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) __field(int, op ) __field(int, usage ) - __field(int, nskb ) __field(const void *, where ) __field(const void *, aux ) ), @@ -36,16 +34,14 @@ TRACE_EVENT(rxrpc_call, __entry->call = call; __entry->op = op; __entry->usage = usage; - __entry->nskb = nskb; __entry->where = where; __entry->aux = aux; ), - TP_printk("c=%p %s u=%d s=%d p=%pSR a=%p", + TP_printk("c=%p %s u=%d sp=%pSR a=%p", __entry->call, rxrpc_call_traces[__entry->op], __entry->usage, - __entry->nskb, __entry->where, __entry->aux) ); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd438dc93ee9..027791261768 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -467,7 +467,6 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9efd9b0b0bdf..f843397e03b6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -232,9 +232,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, 
rxrpc_call_new_client, - atomic_read(&call->usage), 0, - here, (const void *)user_call_ID); + trace_rxrpc_call(call, 0, atomic_read(&call->usage), here, + (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; @@ -325,7 +324,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, return ERR_PTR(-EBUSY); trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), 0, here, NULL); + atomic_read(&candidate->usage), here, NULL); chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->conn = conn; @@ -446,11 +445,10 @@ bool rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = __atomic_add_unless(&call->usage, 1, 0); - int m = atomic_read(&call->skb_count); if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -463,10 +461,9 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -480,9 +477,8 @@ void rxrpc_see_call(struct rxrpc_call *call) const void *here = __builtin_return_address(0); if (call) { int n = atomic_read(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_seen, n, m, here, NULL); + trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); } } @@ -493,9 +489,8 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); } /* @@ -505,9 +500,8 @@ void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - int m = atomic_inc_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_got_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); } /* @@ -642,17 +636,15 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { const void *here = __builtin_return_address(0); - int n, m; + int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - m = atomic_read(&call->skb_count); - trace_rxrpc_call(call, op, n, m, here, NULL); + trace_rxrpc_call(call, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } @@ -663,15 +655,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) { const void *here = __builtin_return_address(0); - int n, m; + int n; n = atomic_dec_return(&call->usage); - m = atomic_dec_return(&call->skb_count); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, m, here, skb); + trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); ASSERTCMP(n, >=, 0); if (n == 0) { 
_debug("call %d dead", call->debug_id); - WARN_ON(m != 0); rxrpc_cleanup_call(call); } } -- cgit v1.2.3 From 49e19ec7d3499f79d2b3a45bb28418e89512fd7a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Add tracepoints to record received packets and end of data_ready Add two tracepoints: (1) Record the RxRPC protocol header of packets retrieved from the UDP socket by the data_ready handler. (2) Record the outcome of the data_ready handler. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 38 ++++++++++++++++++++++++++++++++++++++ net/rxrpc/input.c | 8 ++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6b06cf050bc0..ea3b10ed91a8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -80,6 +80,44 @@ TRACE_EVENT(rxrpc_skb, __entry->where) ); +TRACE_EVENT(rxrpc_rx_packet, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags) + ); + +TRACE_EVENT(rxrpc_rx_done, + TP_PROTO(int result, int abort_code), + + TP_ARGS(result, abort_code), + + TP_STRUCT__entry( + __field(int, result ) + __field(int, abort_code ) + ), + + TP_fast_assign( + __entry->result = result; + __entry->abort_code = abort_code; + ), + + TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) + ); + TRACE_EVENT(rxrpc_abort, TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8e624109750a..6c4b7df05e95 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -683,6 +683,7 @@ void rxrpc_data_ready(struct sock *sk) /* dig out the RxRPC connection details */ if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; + trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", @@ -767,6 +768,7 @@ discard_unlock: out_unlock: rcu_read_unlock(); out: + trace_rxrpc_rx_done(0, 0); return; cant_route_call: @@ -780,7 +782,7 @@ cant_route_call: skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->processor); _leave(" [incoming]"); - return; + goto out; } skb->priority = RX_INVALID_OPERATION; } else { @@ -789,7 +791,7 @@ cant_route_call: if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { _debug("reject type %d",sp->hdr.type); - rxrpc_reject_packet(local, skb); + goto reject_packet; } else { rxrpc_free_skb(skb); } @@ -798,6 +800,8 @@ cant_route_call: bad_message: skb->priority = RX_PROTOCOL_ERROR; +reject_packet: + trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); _leave(" [badmsg]"); } -- cgit v1.2.3 From 00e907127e6f86d0f9b122d9b4347a8aa09a8b61 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Preallocate peers, conns and calls for incoming service requests Make it possible for the data_ready handler called from the UDP transport socket to completely instantiate an rxrpc_call structure and make it immediately live by preallocating all the memory it might need. 
The idea is to cut out the background thread usage as much as possible.

[Note that the preallocated structs are not actually used in this patch - that will be done in a future patch.]

If insufficient resources are available in the preallocation buffers, it will be possible to discard the DATA packet in the data_ready handler or schedule a BUSY packet without the need to schedule an attempt at allocation in a background thread.

To this end:

 (1) Preallocate rxrpc_peer, rxrpc_connection and rxrpc_call structs, each up to the listen backlog size. The backlog size is limited to a maximum of 32. Only this many of each can be in the preallocation buffer.

 (2) For userspace sockets, the preallocation is charged initially by listen() and will be recharged by accepting or rejecting pending new incoming calls.

 (3) For kernel services, {,re,dis}charging of the preallocation buffers is handled manually. Two notifier callbacks have to be provided before kernel_listen() is invoked:

     (a) An indication that a new call has been instantiated. This can be used to trigger background recharging.

     (b) An indication that a call is being discarded. This is used when the socket is being released.

     A function, rxrpc_kernel_charge_accept(), is called by the kernel service to preallocate a single call. It should be passed the user ID to be used for that call and a callback to associate the rxrpc call with the kernel service's side of the ID.

 (4) Discard the preallocation when the socket is closed.

 (5) Temporarily bump the refcount on the call allocated in rxrpc_incoming_call() so that rxrpc_release_call() can ditch the preallocation ref on service calls unconditionally. This will no longer be necessary once the preallocation is used.

Note that this does not yet control the number of active service calls on a client - that will come in a later patch.

A future development would be to provide a setsockopt() call that allows a userspace server to manually charge the preallocation buffer. This would allow user call IDs to be provided in advance and the awkward manual accept stage to be bypassed.
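To make the ring arithmetic concrete, here is a minimal, self-contained userspace C sketch of the head/tail bookkeeping that the backlog buffers described above rely on: counters masked by a power-of-two size, one side charging and one side draining. It is an illustration only - the ring holds plain integers rather than preallocated rxrpc objects, the names (ring_charge, ring_consume, RING_*) are invented for the example, and the kernel's READ_ONCE()/smp_store_release() ordering is omitted because this sketch is single-threaded.

/* Illustrative only: a power-of-two ring indexed by masked head/tail
 * counters, in the style of the rxrpc backlog buffers.
 */
#include <stdio.h>

#define RING_SIZE 32			/* must be a power of two */
#define RING_MASK (RING_SIZE - 1)
#define RING_COUNT(head, tail) (((unsigned int)((head) - (tail))) & RING_MASK)
#define RING_SPACE(head, tail) RING_COUNT((tail), (head) + 1)

struct ring {
	int slots[RING_SIZE];
	unsigned short head;		/* where the charger inserts */
	unsigned short tail;		/* where the consumer removes */
};

/* Charge one entry, as listen() or a kernel service recharge would. */
static int ring_charge(struct ring *r, int value)
{
	if (RING_SPACE(r->head, r->tail) == 0)
		return -1;		/* ring full */
	r->slots[r->head] = value;
	r->head = (r->head + 1) & RING_MASK;
	return 0;
}

/* Consume one entry, as the data_ready handler would for a new call. */
static int ring_consume(struct ring *r, int *value)
{
	if (RING_COUNT(r->head, r->tail) == 0)
		return -1;		/* nothing preallocated */
	*value = r->slots[r->tail];
	r->tail = (r->tail + 1) & RING_MASK;
	return 0;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0 };
	int v;

	while (ring_charge(&r, 1) == 0)
		;			/* fill: 31 of the 32 slots are usable */
	printf("charged %u\n", RING_COUNT(r.head, r.tail));
	while (ring_consume(&r, &v) == 0)
		;			/* drain, as a discard of the preallocation would */
	printf("remaining %u\n", RING_COUNT(r.head, r.tail));
	return 0;
}

Keeping the ring size a power of two lets the occupancy and free-space checks reduce to a subtraction and a mask, which is the reason the backlog maximum is a round 32.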
Signed-off-by: David Howells --- fs/afs/rxrpc.c | 71 ++++++++++++++- include/net/af_rxrpc.h | 10 ++- net/rxrpc/af_rxrpc.c | 16 +++- net/rxrpc/ar-internal.h | 32 ++++++- net/rxrpc/call_accept.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/call_object.c | 12 ++- net/rxrpc/conn_object.c | 2 + net/rxrpc/conn_service.c | 24 +++++ net/rxrpc/input.c | 2 +- net/rxrpc/proc.c | 8 +- 10 files changed, 391 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 53750dece80e..720ef05a24fe 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,6 +18,7 @@ struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; static void afs_free_call(struct afs_call *); @@ -26,7 +27,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); static void afs_process_async_call(struct work_struct *); -static void afs_rx_new_call(struct sock *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ @@ -54,8 +56,10 @@ static const struct afs_call_type afs_RXCMxxxx = { }; static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -100,13 +104,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call); + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -126,6 +132,12 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); @@ -635,12 +647,65 @@ static void afs_collect_incoming_call(struct work_struct *work) afs_free_call(call); } +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + call->rxcall = rxcall; +} + +/* + * Charge the incoming call preallocation. 
+ */ +static void afs_charge_preallocation(struct work_struct *work) +{ + struct afs_call *call = afs_spare_incoming_call; + + for (;;) { + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) + break; + + INIT_WORK(&call->async_work, afs_process_async_call); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + call->state = AFS_CALL_AWAIT_OP_ID; + } + + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; + } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + /* * Notification of an incoming call. */ -static void afs_rx_new_call(struct sock *sk) +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) { queue_work(afs_wq, &afs_collect_incoming_call_work); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 08ed8729126c..9cf551be916b 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -21,10 +21,14 @@ struct rxrpc_call; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *); +typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, + unsigned long); +typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); +typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t); + rxrpc_notify_new_call_t, + rxrpc_discard_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, struct sockaddr_rxrpc *, struct key *, @@ -43,5 +47,7 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, + rxrpc_user_attach_call_t, unsigned long, gfp_t); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f13cca1e973e..1e8cf3ded81f 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -193,7 +193,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - unsigned int max; + unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); @@ -212,9 +212,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) backlog = max; else if (backlog < 0 || backlog > max) break; + old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; - rx->sk.sk_state = RXRPC_SERVER_LISTENING; - ret = 0; + ret = rxrpc_service_prealloc(rx, GFP_KERNEL); + if (ret == 0) + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + else + sk->sk_max_ack_backlog = old; break; default: ret = -EBUSY; @@ -303,16 +307,19 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on * @notify_new_call: Function to be called when new calls appear + * 
@discard_new_call: Function to discard preallocated calls * * Allow a kernel service to be given notifications about new calls. */ void rxrpc_kernel_new_call_notification( struct socket *sock, - rxrpc_notify_new_call_t notify_new_call) + rxrpc_notify_new_call_t notify_new_call, + rxrpc_discard_new_call_t discard_new_call) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); rx->notify_new_call = notify_new_call; + rx->discard_new_call = discard_new_call; } EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); @@ -622,6 +629,7 @@ static int rxrpc_release_sock(struct sock *sk) } /* try to flush out this socket */ + rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 027791261768..45e1c269f90e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -63,6 +63,27 @@ enum { RXRPC_CLOSE, /* socket is being closed */ }; +/* + * Service backlog preallocation. + * + * This contains circular buffers of preallocated peers, connections and calls + * for incoming service calls and their head and tail pointers. This allows + * calls to be set up in the data_ready handler, thereby avoiding the need to + * shuffle packets around so much. + */ +struct rxrpc_backlog { + unsigned short peer_backlog_head; + unsigned short peer_backlog_tail; + unsigned short conn_backlog_head; + unsigned short conn_backlog_tail; + unsigned short call_backlog_head; + unsigned short call_backlog_tail; +#define RXRPC_BACKLOG_MAX 32 + struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; +}; + /* * RxRPC socket definition */ @@ -70,13 +91,15 @@ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ + rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ + struct rxrpc_backlog *backlog; /* Preallocation for services */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ - struct rb_root calls; /* outstanding calls on this socket */ + struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ @@ -290,6 +313,7 @@ enum rxrpc_conn_cache_state { enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ @@ -408,6 +432,7 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - 
server accepting request */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ @@ -534,6 +559,8 @@ extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ +int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); +void rxrpc_discard_prealloc(struct rxrpc_sock *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -557,6 +584,7 @@ extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_alloc_call(gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, @@ -573,6 +601,7 @@ void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); +void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) @@ -757,6 +786,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 4c71efcf82ed..cc7194e05a15 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -20,11 +20,209 @@ #include #include #include +#include #include #include #include #include "ar-internal.h" +/* + * Preallocate a single service call, connection and peer and, if possible, + * give them a user ID and attach the user's side of the ID to them. + */ +static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, + struct rxrpc_backlog *b, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + const void *here = __builtin_return_address(0); + struct rxrpc_call *call; + int max, tmp; + unsigned int size = RXRPC_BACKLOG_MAX; + unsigned int head, tail, call_head, call_tail; + + max = rx->sk.sk_max_ack_backlog; + tmp = rx->sk.sk_ack_backlog; + if (tmp >= max) { + _leave(" = -ENOBUFS [full %u]", max); + return -ENOBUFS; + } + max -= tmp; + + /* We don't need more conns and peers than we have calls, but on the + * other hand, we shouldn't ever use more peers than conns or conns + * than calls. 
+ */ + call_head = b->call_backlog_head; + call_tail = READ_ONCE(b->call_backlog_tail); + tmp = CIRC_CNT(call_head, call_tail, size); + if (tmp >= max) { + _leave(" = -ENOBUFS [enough %u]", tmp); + return -ENOBUFS; + } + max = tmp + 1; + + head = b->peer_backlog_head; + tail = READ_ONCE(b->peer_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp); + if (!peer) + return -ENOMEM; + b->peer_backlog[head] = peer; + smp_store_release(&b->peer_backlog_head, + (head + 1) & (size - 1)); + } + + head = b->conn_backlog_head; + tail = READ_ONCE(b->conn_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_connection *conn; + + conn = rxrpc_prealloc_service_connection(gfp); + if (!conn) + return -ENOMEM; + b->conn_backlog[head] = conn; + smp_store_release(&b->conn_backlog_head, + (head + 1) & (size - 1)); + } + + /* Now it gets complicated, because calls get registered with the + * socket here, particularly if a user ID is preassigned by the user. + */ + call = rxrpc_alloc_call(gfp); + if (!call) + return -ENOMEM; + call->flags |= (1 << RXRPC_CALL_IS_SERVICE); + call->state = RXRPC_CALL_SERVER_PREALLOC; + + trace_rxrpc_call(call, rxrpc_call_new_service, + atomic_read(&call->usage), + here, (const void *)user_call_ID); + + write_lock(&rx->call_lock); + if (user_attach_call) { + struct rxrpc_call *xcall; + struct rb_node *parent, **pp; + + /* Check the user ID isn't already in use */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto id_in_use; + } + + call->user_call_ID = user_call_ID; + call->notify_rx = notify_rx; + rxrpc_get_call(call, rxrpc_call_got); + user_attach_call(call, user_call_ID); + rxrpc_get_call(call, rxrpc_call_got_userid); + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + + write_unlock(&rx->call_lock); + + write_lock(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock(&rxrpc_call_lock); + + b->call_backlog[call_head] = call; + smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); + _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); + return 0; + +id_in_use: + write_unlock(&rx->call_lock); + rxrpc_cleanup_call(call); + _leave(" = -EBADSLT"); + return -EBADSLT; +} + +/* + * Preallocate sufficient service connections, calls and peers to cover the + * entire backlog of a socket. When a new call comes in, if we don't have + * sufficient of each available, the call gets rejected as busy or ignored. + * + * The backlog is replenished when a connection is accepted or rejected. + */ +int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) +{ + struct rxrpc_backlog *b = rx->backlog; + + if (!b) { + b = kzalloc(sizeof(struct rxrpc_backlog), gfp); + if (!b) + return -ENOMEM; + rx->backlog = b; + } + + if (rx->discard_new_call) + return 0; + + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + ; + + return 0; +} + +/* + * Discard the preallocation on a service. 
+ */ +void rxrpc_discard_prealloc(struct rxrpc_sock *rx) +{ + struct rxrpc_backlog *b = rx->backlog; + unsigned int size = RXRPC_BACKLOG_MAX, head, tail; + + if (!b) + return; + rx->backlog = NULL; + + head = b->peer_backlog_head; + tail = b->peer_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_peer *peer = b->peer_backlog[tail]; + kfree(peer); + tail = (tail + 1) & (size - 1); + } + + head = b->conn_backlog_head; + tail = b->conn_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_connection *conn = b->conn_backlog[tail]; + write_lock(&rxrpc_connection_lock); + list_del(&conn->link); + list_del(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); + kfree(conn); + tail = (tail + 1) & (size - 1); + } + + head = b->call_backlog_head; + tail = b->call_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_call *call = b->call_backlog[tail]; + if (rx->discard_new_call) { + _debug("discard %lx", call->user_call_ID); + rx->discard_new_call(call, call->user_call_ID); + } + rxrpc_call_completed(call); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + tail = (tail + 1) & (size - 1); + } + + kfree(b); +} + /* * generate a connection-level abort */ @@ -450,3 +648,34 @@ int rxrpc_kernel_reject_call(struct socket *sock) return ret; } EXPORT_SYMBOL(rxrpc_kernel_reject_call); + +/* + * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls + * @sock: The socket on which to preallocate + * @notify_rx: Event notification function for the call + * @user_attach_call: Func to attach call to user_call_ID + * @user_call_ID: The tag to attach to the preallocated call + * @gfp: The allocation conditions. + * + * Charge up the socket with preallocated calls, each with a user ID. A + * function should be provided to effect the attachment from the user's side. + * The user is given a ref to hold on the call. + * + * Note that the call may be come connected before this function returns. 
+ */ +int rxrpc_kernel_charge_accept(struct socket *sock, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_backlog *b = rx->backlog; + + if (sock->sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + return rxrpc_service_prealloc_one(rx, b, notify_rx, + user_attach_call, user_call_ID, + gfp); +} +EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f843397e03b6..d233adc9b5e5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -31,6 +31,7 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", @@ -71,7 +72,6 @@ DEFINE_RWLOCK(rxrpc_call_lock); static void rxrpc_call_life_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); -static void rxrpc_cleanup_call(struct rxrpc_call *call); /* * find an extant server call @@ -113,7 +113,7 @@ found_extant_call: /* * allocate a new call */ -static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) { struct rxrpc_call *call; @@ -392,6 +392,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (call_id <= conn->channels[chan].call_counter) goto old_call; /* TODO: Just drop packet */ + /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ + rxrpc_get_call(candidate, rxrpc_call_got); + /* make the call available */ _debug("new call"); call = candidate; @@ -596,6 +599,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); + /* We have to release the prealloc backlog ref */ + if (rxrpc_is_service_call(call)) + rxrpc_put_call(call, rxrpc_call_put); _leave(""); } @@ -682,7 +688,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) /* * clean up a call */ -static void rxrpc_cleanup_call(struct rxrpc_call *call) +void rxrpc_cleanup_call(struct rxrpc_call *call) { _net("DESTROY CALL %d", call->debug_id); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 9c6685b97e70..8da82e3aa00e 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -286,6 +286,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) ASSERTCMP(atomic_read(&conn->usage), >, 0); if (likely(atomic_read(&conn->usage) > 1)) continue; + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) + continue; idle_timestamp = READ_ONCE(conn->idle_timestamp); _debug("reap CONN %d { u=%d,t=%ld }", diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 316a92107fee..189338a60457 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -118,6 +118,30 @@ replace_old_connection: goto conn_published; } +/* + * Preallocate a service connection. The connection is placed on the proc and + * reap lists so that we don't have to get the lock from BH context. 
+ */ +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); + + if (conn) { + /* We maintain an extra ref on the connection whilst it is on + * the rxrpc_connections list. + */ + conn->state = RXRPC_CONN_SERVICE_PREALLOC; + atomic_set(&conn->usage, 2); + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); + write_unlock(&rxrpc_connection_lock); + } + + return conn; +} + /* * get a record of an incoming connection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6c4b7df05e95..5906579060cd 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -102,7 +102,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, rx->notify_new_call) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk); + rx->notify_new_call(&rx->sk, NULL, 0); } else if (call->notify_rx) { spin_unlock_bh(&sk->sk_receive_queue.lock); skb_queue_tail(&call->knlrecv_queue, skb); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index dfad23821a62..d529d1b4021c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -17,6 +17,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", @@ -156,6 +157,11 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } conn = list_entry(v, struct rxrpc_connection, proc_link); + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) { + strcpy(lbuff, "no_local"); + strcpy(rbuff, "no_connection"); + goto print; + } sprintf(lbuff, "%pI4:%u", &conn->params.local->srx.transport.sin.sin_addr, @@ -164,7 +170,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pI4:%u", &conn->params.peer->srx.transport.sin.sin_addr, ntohs(conn->params.peer->srx.transport.sin.sin_port)); - +print: seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %s %3u" " %s %08x %08x %08x\n", -- cgit v1.2.3 From 248f219cb8bcbfbd7f132752d44afa2df7c241d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Sep 2016 11:10:12 +0100 Subject: rxrpc: Rewrite the data and ack handling code Rewrite the data and ack handling code such that: (1) Parsing of received ACK and ABORT packets and the distribution and the filing of DATA packets happens entirely within the data_ready context called from the UDP socket. This allows us to process and discard ACK and ABORT packets much more quickly (they're no longer stashed on a queue for a background thread to process). (2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead keep track of the offset and length of the content of each packet in the sk_buff metadata. This means we don't do any allocation in the receive path. (3) Jumbo DATA packet parsing is now done in data_ready context. Rather than cloning the packet once for each subpacket and pulling/trimming it, we file the packet multiple times with an annotation for each indicating which subpacket is there. From that we can directly calculate the offset and length. (4) A call's receive queue can be accessed without taking locks (memory barriers do have to be used, though). (5) Incoming calls are set up from preallocated resources and immediately made live. 
They can then have packets queued upon them and ACKs generated. If insufficient resources exist, DATA packet #1 is given a BUSY reply and other DATA packets are discarded.

 (6) sk_buffs no longer take a ref on their parent call.

To make this work, the following changes are made:

 (1) Each call's receive buffer is now a circular buffer of sk_buff pointers (rxtx_buffer) rather than a number of sk_buff_heads spread between the call and the socket. This permits each sk_buff to be in the buffer multiple times. The receive buffer is reused for the transmit buffer.

 (2) A circular buffer of annotations (rxtx_annotations) is kept parallel to the data buffer. Transmission phase annotations indicate whether a buffered packet has been ACK'd or not and whether it needs retransmission.

     Receive phase annotations indicate whether a slot holds a whole packet or a jumbo subpacket and, if the latter, which subpacket. They also note whether the packet has been decrypted in place.

 (3) DATA packet window tracking is much simplified. Each phase has just two numbers representing the window (rx_hard_ack/rx_top and tx_hard_ack/tx_top).

     The hard_ack number is the sequence number before the base of the window, representing the last packet the other side says it has consumed. hard_ack starts from 0 and the first packet is sequence number 1. The top number is the sequence number of the highest-numbered packet residing in the buffer. Packets between hard_ack+1 and top are soft-ACK'd to indicate they've been received, but not yet consumed.

     Four macros, before(), before_eq(), after() and after_eq(), are added to compare sequence numbers within the window. This allows for the top of the window to wrap when the hard-ack sequence number gets close to the limit.

     Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are also added to indicate when rx_top and tx_top point at the packets with the LAST_PACKET bit set, indicating the end of the phase.

 (4) Calls are queued on the socket 'receive queue' rather than packets. This means that we don't have to invent dummy packets to queue to indicate abnormal/terminal states and we don't have to keep metadata packets (such as ABORTs) around.

 (5) The offset and length of a (sub)packet's content are now passed to the verify_packet security op. This is currently expected to decrypt the packet in place and validate it.

     However, there's now nowhere to store the revised offset and length of the actual data within the decrypted blob (there may be a header and padding to skip) because an sk_buff may represent multiple packets, so a locate_data security op is added to retrieve these details from the sk_buff content when needed.

 (6) recvmsg() now has to handle jumbo subpackets, where each subpacket is individually secured and needs to be individually decrypted. The code to do this is broken out into rxrpc_recvmsg_data() and shared with the kernel API. It now iterates over the call's receive buffer rather than walking the socket receive queue.

Additional changes:

 (1) The timers are condensed to a single timer that is set for the soonest of three timeouts (delayed ACK generation, DATA retransmission and call lifespan).

 (2) Transmission of ACK and ABORT packets is effected immediately from process-context socket ops/kernel API calls that cause them instead of them being punted off to a background work item. The data_ready handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS filesystem can shut down the socket and flush its own work items before closing the socket to deal with any in-progress service calls. Future additional changes that will need to be considered: (1) Make sure that a call doesn't hog the front of the queue by receiving data from the network as fast as userspace is consuming it to the exclusion of other calls. (2) Transmit delayed ACKs from within recvmsg() when we've consumed sufficiently more packets to avoid the background work item needing to run. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 51 +- include/net/af_rxrpc.h | 3 - include/rxrpc/packet.h | 7 + net/rxrpc/af_rxrpc.c | 57 +- net/rxrpc/ar-internal.h | 177 +++--- net/rxrpc/call_accept.c | 472 +++++++--------- net/rxrpc/call_event.c | 1357 +++++++--------------------------------------- net/rxrpc/call_object.c | 535 +++++------------- net/rxrpc/conn_event.c | 137 +---- net/rxrpc/conn_object.c | 6 +- net/rxrpc/conn_service.c | 101 +--- net/rxrpc/input.c | 1044 ++++++++++++++++++----------------- net/rxrpc/insecure.c | 13 +- net/rxrpc/local_event.c | 2 +- net/rxrpc/local_object.c | 7 - net/rxrpc/misc.c | 2 +- net/rxrpc/output.c | 125 ++++- net/rxrpc/peer_event.c | 17 +- net/rxrpc/peer_object.c | 82 ++- net/rxrpc/recvmsg.c | 764 ++++++++++++++------------ net/rxrpc/rxkad.c | 108 +++- net/rxrpc/security.c | 10 +- net/rxrpc/sendmsg.c | 126 ++--- net/rxrpc/skbuff.c | 127 ----- 24 files changed, 1993 insertions(+), 3337 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 720ef05a24fe..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -55,10 +55,8 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); static void afs_charge_preallocation(struct work_struct *); -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) @@ -143,6 +141,8 @@ void afs_close_socket(void) TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); flush_workqueue(afs_async_calls); sock_release(afs_socket); @@ -602,51 +602,6 @@ static void afs_process_async_call(struct work_struct *work) _leave(""); } -/* - * accept the backlog of incoming calls - */ -static void afs_collect_incoming_call(struct work_struct *work) -{ - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - - _enter(""); - - do { - if (!call) { - call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } - - INIT_WORK(&call->async_work, afs_process_async_call); - call->wait_mode = &afs_async_incoming_call; - call->type = &afs_RXCMxxxx; - init_waitqueue_head(&call->waitq); - call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); - } - - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long)call, - afs_wake_up_async_call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call->need_attention = true; - queue_work(afs_async_calls, &call->async_work); - call = NULL; - } - } while (!call); - - if (call) - afs_free_call(call); -} - static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call 
= (struct afs_call *)user_call_ID; @@ -704,7 +659,7 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - queue_work(afs_wq, &afs_collect_incoming_call_work); + atomic_inc(&afs_outstanding_calls); queue_work(afs_wq, &afs_charge_preallocation_work); } diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 9cf551be916b..1061a472a3e3 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -42,9 +42,6 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long, - rxrpc_notify_rx_t); -int rxrpc_kernel_reject_call(struct socket *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index b0ae5c1a6ce6..fd6eb3a60a8c 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -133,6 +133,13 @@ struct rxrpc_ackpacket { } __packed; +/* Some ACKs refer to specific packets and some are general and can be updated. */ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + /* * ACK packets can have a further piece of information tagged on the end */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1e8cf3ded81f..caa226dd436e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -155,7 +155,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) } if (rx->srx.srx_service) { - write_lock_bh(&local->services_lock); + write_lock(&local->services_lock); hlist_for_each_entry(prx, &local->services, listen_link) { if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; @@ -163,7 +163,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) rx->local = local; hlist_add_head_rcu(&rx->listen_link, &local->services); - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { @@ -176,7 +176,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rxrpc_put_local(local); ret = -EADDRINUSE; error_unlock: @@ -515,15 +515,16 @@ error: static unsigned int rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + unsigned int mask; sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!list_empty(&rx->recvmsg_q)) mask |= POLLIN | POLLRDNORM; /* the socket is writable if there is space to add new data to the @@ -575,8 +576,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; INIT_HLIST_NODE(&rx->listen_link); - INIT_LIST_HEAD(&rx->secureq); - INIT_LIST_HEAD(&rx->acceptq); + spin_lock_init(&rx->incoming_lock); + INIT_LIST_HEAD(&rx->sock_calls); + INIT_LIST_HEAD(&rx->to_be_accepted); + INIT_LIST_HEAD(&rx->recvmsg_q); + 
rwlock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); @@ -584,6 +588,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return 0; } +/* + * Kill all the calls on a socket and shut it down. + */ +static int rxrpc_shutdown(struct socket *sock, int flags) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + _enter("%p,%d", sk, flags); + + if (flags != SHUT_RDWR) + return -EOPNOTSUPP; + if (sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + lock_sock(sk); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (sk->sk_state < RXRPC_CLOSE) { + sk->sk_state = RXRPC_CLOSE; + sk->sk_shutdown = SHUTDOWN_MASK; + } else { + ret = -ESHUTDOWN; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + rxrpc_discard_prealloc(rx); + + release_sock(sk); + return ret; +} + /* * RxRPC socket destructor */ @@ -623,9 +660,9 @@ static int rxrpc_release_sock(struct sock *sk) ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); if (!hlist_unhashed(&rx->listen_link)) { - write_lock_bh(&rx->local->services_lock); + write_lock(&rx->local->services_lock); hlist_del_rcu(&rx->listen_link); - write_unlock_bh(&rx->local->services_lock); + write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ @@ -678,7 +715,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, - .shutdown = sock_no_shutdown, + .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = rxrpc_sendmsg, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 45e1c269f90e..b1cb79ec4e96 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -94,9 +94,12 @@ struct rxrpc_sock { rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ struct hlist_node listen_link; /* link in the local endpoint's listen list */ - struct list_head secureq; /* calls awaiting connection security clearance */ - struct list_head acceptq; /* calls awaiting acceptance */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ + struct list_head sock_calls; /* List of calls owned by this socket */ + struct list_head to_be_accepted; /* calls awaiting acceptance */ + struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ + rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ @@ -138,13 +141,16 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_call *call; /* call with which associated */ - unsigned long resend_at; /* time in jiffies at which to resend */ + union { + unsigned long resend_at; /* time in jiffies at which to resend */ + struct { + u8 nr_jumbo; /* Number of jumbo subpackets */ + }; + }; union { unsigned int offset; /* offset into buffer of next read */ int remain; /* amount of space remaining for next write */ u32 error; /* network error code */ - bool need_resend; /* T if needs resending */ }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -179,7 +185,11 @@ struct rxrpc_security { /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, - rxrpc_seq_t, u16); + 
unsigned int, unsigned int, rxrpc_seq_t, u16); + + /* Locate the data in a received packet that has been verified. */ + void (*locate_data)(struct rxrpc_call *, struct sk_buff *, + unsigned int *, unsigned int *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -211,7 +221,6 @@ struct rxrpc_local { struct work_struct processor; struct hlist_head services; /* services listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ - struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct rb_root client_conns; /* Client connections by socket params */ @@ -388,38 +397,21 @@ struct rxrpc_connection { */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ - RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ - RXRPC_CALL_RCVD_LAST, /* all packets received */ - RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */ - RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ - RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ - RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ - RXRPC_CALL_RX_NO_MORE, /* Don't indicate MSG_MORE from recvmsg() */ + RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ + RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ - RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ - RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ - RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ - RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ RXRPC_CALL_EV_ACK, /* need to generate ACK */ - RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ - RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ - RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ - RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ - RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ - RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" 
message to the app */ }; /* @@ -431,7 +423,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ @@ -448,7 +439,6 @@ enum rxrpc_call_state { */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ @@ -465,24 +455,23 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ - struct timer_list lifetimer; /* lifetime remaining on call */ - struct timer_list ack_timer; /* ACK generation timer */ - struct timer_list resend_timer; /* Tx resend timer */ - struct work_struct processor; /* packet processor and ACK generator */ + unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long resend_at; /* When next resend needs to happen */ + unsigned long expire_at; /* When the call times out */ + struct timer_list timer; /* Combined event timer */ + struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ - struct list_head accept_link; /* calls awaiting acceptance */ - struct rb_node sock_node; /* node in socket call tree */ - struct sk_buff_head rx_queue; /* received packets */ - struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ - struct sk_buff_head knlrecv_queue; /* Queue for kernel_recv [TODO: replace this] */ + struct list_head accept_link; /* Link in rx->acceptq */ + struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ + struct list_head sock_link; /* Link in rx->sock_calls */ + struct rb_node sock_node; /* Node in rx->calls */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ - unsigned long creation_jif; /* time of call creation */ unsigned long flags; unsigned long events; spinlock_t lock; @@ -492,40 +481,55 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ atomic_t usage; - atomic_t sequence; /* Tx data packet sequence counter */ u16 service_id; /* service ID */ u8 security_ix; /* Security type */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ - /* transmission-phase ACK management */ - u8 acks_head; /* offset into window of first entry */ - u8 acks_tail; /* offset into window of last entry */ - u8 acks_winsz; /* size of un-ACK'd window */ - u8 acks_unacked; /* lowest unacked 
packet in last ACK received */ - int acks_latest; /* serial number of latest ACK received */ - rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ - unsigned long *acks_window; /* sent packet window - * - elements are pointers with LSB set if ACK'd + /* Rx/Tx circular buffer, depending on phase. + * + * In the Rx phase, packets are annotated with 0 or the number of the + * segment of a jumbo packet each buffer refers to. There can be up to + * 47 segments in a maximum-size UDP packet. + * + * In the Tx phase, packets are annotated with which buffers have been + * acked. + */ +#define RXRPC_RXTX_BUFF_SIZE 64 +#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) + struct sk_buff **rxtx_buffer; + u8 *rxtx_annotations; +#define RXRPC_TX_ANNO_ACK 0 +#define RXRPC_TX_ANNO_UNACK 1 +#define RXRPC_TX_ANNO_NAK 2 +#define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ +#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ +#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ + rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but + * not hard-ACK'd packet follows this. + */ + rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not + * consumed packet follows this. */ + rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ + rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + u8 rx_winsize; /* Size of Rx window */ + u8 tx_winsize; /* Maximum size of Tx window */ + u8 nr_jumbo_dup; /* Number of jumbo duplicates */ /* receive-phase ACK management */ - rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ - rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ - rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ - rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ - rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ - rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic_t ackr_not_idle; /* number of packets in Rx queue */ + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ - /* received packet records, 1 bit per record */ -#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) - unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + /* transmission-phase ACK management */ + rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; enum rxrpc_call_trace { @@ -535,10 +539,8 @@ enum rxrpc_call_trace { rxrpc_call_queued_ref, rxrpc_call_seen, rxrpc_call_got, - rxrpc_call_got_skb, rxrpc_call_got_userid, rxrpc_call_put, - rxrpc_call_put_skb, rxrpc_call_put_userid, rxrpc_call_put_noqueue, rxrpc_call__nr_trace @@ -561,6 +563,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_connection *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call 
*rxrpc_accept_call(struct rxrpc_sock *, unsigned long, rxrpc_notify_rx_t); @@ -569,8 +574,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); /* @@ -589,9 +593,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, unsigned long, gfp_t); -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, - struct rxrpc_connection *, - struct sk_buff *); +void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, + struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); @@ -599,8 +602,6 @@ bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); -void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *); -void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *); void rxrpc_cleanup_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); @@ -672,13 +673,8 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, { trace_rxrpc_abort(why, call->cid, call->call_id, seq, abort_code, error); - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error)) { - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - return true; - } - return false; + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); } static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, @@ -713,8 +709,6 @@ void __exit rxrpc_destroy_all_client_connections(void); * conn_event.c */ void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct rxrpc_local *); /* * conn_object.c @@ -783,18 +777,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct sockaddr_rxrpc *, - struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); +void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_data_ready(struct sock *); -int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); -void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* * insecure.c @@ -868,6 +858,7 @@ extern const char *rxrpc_acks(u8 reason); */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +void rxrpc_reject_packets(struct rxrpc_local *); /* * peer_event.c @@ -883,6 +874,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, + struct rxrpc_peer *); 
static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { @@ -912,6 +905,7 @@ extern const struct file_operations rxrpc_connection_seq_fops; /* * recvmsg.c */ +void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* @@ -961,6 +955,23 @@ static inline void rxrpc_sysctl_exit(void) {} */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); +static inline bool before(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) < 0; +} +static inline bool before_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) <= 0; +} +static inline bool after(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) > 0; +} +static inline bool after_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) >= 0; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cc7194e05a15..b8acec0d596e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -129,6 +129,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, set_bit(RXRPC_CALL_HAS_USERID, &call->flags); } + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); write_lock(&rxrpc_call_lock); @@ -186,6 +188,12 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) return; rx->backlog = NULL; + /* Make sure that there aren't any incoming calls in progress before we + * clear the preallocation buffers. + */ + spin_lock_bh(&rx->incoming_lock); + spin_unlock_bh(&rx->incoming_lock); + head = b->peer_backlog_head; tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { @@ -224,251 +232,179 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) } /* - * generate a connection-level abort + * Allocate a new incoming call from the prealloc pool, along with a connection + * and a peer as necessary. */ -static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_wire_header *whdr) +static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct msghdr msg; - struct kvec iov[1]; - size_t len; - int ret; - - _enter("%d,,", local->debug_id); - - whdr->type = RXRPC_PACKET_TYPE_BUSY; - whdr->serial = htonl(1); - - msg.msg_name = &srx->transport.sin; - msg.msg_namelen = sizeof(srx->transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - iov[0].iov_base = whdr; - iov[0].iov_len = sizeof(*whdr); - - len = iov[0].iov_len; - - _proto("Tx BUSY %%1"); + struct rxrpc_backlog *b = rx->backlog; + struct rxrpc_peer *peer, *xpeer; + struct rxrpc_call *call; + unsigned short call_head, conn_head, peer_head; + unsigned short call_tail, conn_tail, peer_tail; + unsigned short call_count, conn_count; + + /* #calls >= #conns >= #peers must hold true. */ + call_head = smp_load_acquire(&b->call_backlog_head); + call_tail = b->call_backlog_tail; + call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); + conn_head = smp_load_acquire(&b->conn_backlog_head); + conn_tail = b->conn_backlog_tail; + conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); + ASSERTCMP(conn_count, >=, call_count); + peer_head = smp_load_acquire(&b->peer_backlog_head); + peer_tail = b->peer_backlog_tail; + ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, + conn_count); + + if (call_count == 0) + return NULL; + + if (!conn) { + /* No connection. We're going to need a peer to start off + * with. 
If one doesn't yet exist, use a spare from the + * preallocation set. We dump the address into the spare in + * anticipation - and to save on stack space. + */ + xpeer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0) + return NULL; + + peer = rxrpc_lookup_incoming_peer(local, xpeer); + if (peer == xpeer) { + b->peer_backlog[peer_tail] = NULL; + smp_store_release(&b->peer_backlog_tail, + (peer_tail + 1) & + (RXRPC_BACKLOG_MAX - 1)); + } - ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); - if (ret < 0) { - _leave(" = -EAGAIN [sendmsg failed: %d]", ret); - return -EAGAIN; + /* Now allocate and set up the connection */ + conn = b->conn_backlog[conn_tail]; + b->conn_backlog[conn_tail] = NULL; + smp_store_release(&b->conn_backlog_tail, + (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + rxrpc_get_local(local); + conn->params.local = local; + conn->params.peer = peer; + rxrpc_new_incoming_connection(conn, skb); + } else { + rxrpc_get_connection(conn); } - _leave(" = 0"); - return 0; + /* And now we can allocate and set up a new call */ + call = b->call_backlog[call_tail]; + b->call_backlog[call_tail] = NULL; + smp_store_release(&b->call_backlog_tail, + (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + call->conn = conn; + call->peer = rxrpc_get_peer(conn->params.peer); + return call; } /* - * accept an incoming call that needs peer, transport and/or connection setting - * up + * Set up a new incoming call. Called in BH context with the RCU read lock + * held. + * + * If this is for a kernel service, when we allocate the call, it will have + * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the + * retainer ref obtained from the backlog buffer. Prealloc calls for userspace + * services only have the ref from the backlog buffer. We want to pass this + * ref to non-BH context to dispose of. + * + * If we want to report an error, we mark the skb with the packet type and + * abort code and return NULL. 
*/ -static int rxrpc_accept_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sk_buff *skb, - struct sockaddr_rxrpc *srx) +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; - struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; struct rxrpc_call *call; - struct sk_buff *notification; - int ret; _enter(""); - sp = rxrpc_skb(skb); - - /* get a notification message to send to the server app */ - notification = alloc_skb(0, GFP_NOFS); - if (!notification) { - _debug("no memory"); - ret = -ENOMEM; - goto error_nofree; - } - rxrpc_new_skb(notification); - notification->mark = RXRPC_SKB_MARK_NEW_CALL; - - conn = rxrpc_incoming_connection(local, srx, skb); - if (IS_ERR(conn)) { - _debug("no conn"); - ret = PTR_ERR(conn); - goto error; - } - - call = rxrpc_incoming_call(rx, conn, skb); - rxrpc_put_connection(conn); - if (IS_ERR(call)) { - _debug("no call"); - ret = PTR_ERR(call); - goto error; + /* Get the socket providing the service */ + hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) { + if (rx->srx.srx_service == sp->hdr.serviceId) + goto found_service; } - /* attach the call to the socket */ - read_lock_bh(&local->services_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) - goto invalid_service; - - write_lock(&rx->call_lock); - if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call, rxrpc_call_got); - - spin_lock(&call->conn->state_lock); - if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { - _debug("await conn sec"); - list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; - set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); - } else { - _debug("conn ready"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call_for_skb(call, notification); - nsp = rxrpc_skb(notification); - nsp->call = call; - - ASSERTCMP(atomic_read(&call->usage), >=, 3); - - _debug("notify"); - spin_lock(&call->lock); - ret = rxrpc_queue_rcv_skb(call, notification, true, - false); - spin_unlock(&call->lock); - notification = NULL; - BUG_ON(ret < 0); - } - spin_unlock(&call->conn->state_lock); + trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [service]"); + return NULL; - _debug("queued"); +found_service: + spin_lock(&rx->incoming_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) { + trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [close]"); + call = NULL; + goto out; } - write_unlock(&rx->call_lock); - _debug("process"); - rxrpc_fast_process_packet(call, skb); - - _debug("done"); - read_unlock_bh(&local->services_lock); - rxrpc_free_skb(notification); - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = 0"); - return 0; - -invalid_service: - _debug("invalid"); - read_unlock_bh(&local->services_lock); - - rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); - ret = -ECONNREFUSED; -error: - rxrpc_free_skb(notification); -error_nofree: - _leave(" = %d", ret); - return ret; -} + call = 
rxrpc_alloc_incoming_call(rx, local, conn, skb); + if (!call) { + skb->mark = RXRPC_SKB_MARK_BUSY; + _leave(" = NULL [busy]"); + call = NULL; + goto out; + } -/* - * accept incoming calls that need peer, transport and/or connection setting up - * - the packets we get are all incoming client DATA packets that have seq == 1 - */ -void rxrpc_accept_incoming_calls(struct rxrpc_local *local) -{ - struct rxrpc_skb_priv *sp; - struct sockaddr_rxrpc srx; - struct rxrpc_sock *rx; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - int ret; + /* Make the call live. */ + rxrpc_incoming_call(rx, call, skb); + conn = call->conn; - _enter("%d", local->debug_id); + if (rx->notify_new_call) + rx->notify_new_call(&rx->sk, call, call->user_call_ID); - skb = skb_dequeue(&local->accept_queue); - if (!skb) { - _leave("\n"); - return; - } + spin_lock(&conn->state_lock); + switch (conn->state) { + case RXRPC_CONN_SERVICE_UNSECURED: + conn->state = RXRPC_CONN_SERVICE_CHALLENGING; + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + break; - _net("incoming call skb %p", skb); - - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - - /* Set up a response packet header in case we need it */ - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = 0; - whdr.flags = 0; - whdr.type = 0; - whdr.userStatus = 0; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) - goto drop; - - /* get the socket providing the service */ - read_lock_bh(&local->services_lock); - hlist_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId && - rx->sk.sk_state != RXRPC_CLOSE) - goto found_service; - } - read_unlock_bh(&local->services_lock); - goto invalid_service; + case RXRPC_CONN_SERVICE: + write_lock(&call->state_lock); + if (rx->discard_new_call) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + else + call->state = RXRPC_CALL_SERVER_ACCEPTING; + write_unlock(&call->state_lock); + break; -found_service: - _debug("found service %hd", rx->srx.srx_service); - if (sk_acceptq_is_full(&rx->sk)) - goto backlog_full; - sk_acceptq_added(&rx->sk); - read_unlock_bh(&local->services_lock); - - ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); - if (ret < 0) - sk_acceptq_removed(&rx->sk); - switch (ret) { - case -ECONNRESET: /* old calls are ignored */ - case -ECONNABORTED: /* aborted calls are reaborted or ignored */ - case 0: - return; - case -ECONNREFUSED: - goto invalid_service; - case -EBUSY: - goto busy; - case -EKEYREJECTED: - goto security_mismatch; + case RXRPC_CONN_REMOTELY_ABORTED: + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + conn->remote_abort, ECONNABORTED); + break; + case RXRPC_CONN_LOCALLY_ABORTED: + rxrpc_abort_call("CON", call, sp->hdr.seq, + conn->local_abort, ECONNABORTED); + break; default: BUG(); } + spin_unlock(&conn->state_lock); -backlog_full: - read_unlock_bh(&local->services_lock); -busy: - rxrpc_busy(local, &srx, &whdr); - rxrpc_free_skb(skb); - return; - -drop: - rxrpc_free_skb(skb); - return; + if (call->state == RXRPC_CALL_SERVER_ACCEPTING) + rxrpc_notify_socket(call); -invalid_service: - skb->priority = RX_INVALID_OPERATION; - rxrpc_reject_packet(local, skb); - return; - - /* can't change connection security type mid-flow */ -security_mismatch: - skb->priority = RX_PROTOCOL_ERROR; - 
rxrpc_reject_packet(local, skb); - return; + _leave(" = %p{%d}", call, call->debug_id); +out: + spin_unlock(&rx->incoming_lock); + return call; } /* @@ -490,11 +426,10 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_lock(&rx->call_lock); ret = -ENODATA; - if (list_empty(&rx->acceptq)) + if (list_empty(&rx->to_be_accepted)) goto out; /* check the user ID isn't already in use */ - ret = -EBADSLT; pp = &rx->calls.rb_node; parent = NULL; while (*pp) { @@ -506,11 +441,14 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, else if (user_call_ID > call->user_call_ID) pp = &(*pp)->rb_right; else - goto out; + goto id_in_use; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. + */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -528,31 +466,35 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ - rxrpc_get_call(call, rxrpc_call_got_userid); + rxrpc_get_call(call, rxrpc_call_got); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) - BUG(); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_queue_call(call); + rxrpc_notify_socket(call); + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %p{%d}", call, call->debug_id); return call; out_release: + _debug("release %p", call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - _debug("release %p", call); rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ERR_PTR(ret); -out: + rxrpc_put_call(call, rxrpc_call_put); + goto out; + +id_in_use: + ret = -EBADSLT; write_unlock(&rx->call_lock); +out: + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ERR_PTR(ret); } @@ -564,6 +506,7 @@ out: int rxrpc_reject_call(struct rxrpc_sock *rx) { struct rxrpc_call *call; + bool abort = false; int ret; _enter(""); @@ -572,15 +515,16 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->acceptq)) { + if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); - _leave(" = -ENODATA"); return -ENODATA; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. 
+ */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); @@ -588,66 +532,28 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: - __rxrpc_set_call_completion(call, RXRPC_CALL_SERVER_BUSY, - 0, ECONNABORTED); - if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) - rxrpc_queue_call(call); - ret = 0; - break; + __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED); + abort = true; + /* fall through */ case RXRPC_CALL_COMPLETE: ret = call->error; - break; + goto out_discard; default: BUG(); } +out_discard: write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); - rxrpc_release_call(rx, call); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call - * @sock: The socket on which the impending call is waiting - * @user_call_ID: The tag to attach to the call - * @notify_rx: Where to send notifications instead of socket queue - * - * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. The caller should immediately trigger their own - * notification as there must be data waiting. - */ -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID, - rxrpc_notify_rx_t notify_rx) -{ - struct rxrpc_call *call; - - _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID, notify_rx); - _leave(" = %p", call); - return call; -} -EXPORT_SYMBOL(rxrpc_kernel_accept_call); - -/** - * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call - * @sock: The socket on which the impending call is waiting - * - * Allow a kernel service to reject an incoming call with a BUSY message, - * assuming the incoming call is still valid. 
- */ -int rxrpc_kernel_reject_call(struct socket *sock) -{ - int ret; - - _enter(""); - ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + if (abort) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ret; } -EXPORT_SYMBOL(rxrpc_kernel_reject_call); /* * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index af88ad7d2cf9..2b976e789562 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -22,1257 +22,286 @@ #include "ar-internal.h" /* - * propose an ACK be sent + * Set the timer */ -void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void rxrpc_set_timer(struct rxrpc_call *call) { - unsigned long expiry; - s8 prior = rxrpc_ack_priority[ack_reason]; - - ASSERTCMP(prior, >, 0); - - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); + unsigned long t, now = jiffies; - if (prior < rxrpc_ack_priority[call->ackr_reason]) { - if (immediate) - goto cancel_timer; - return; - } - - /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers */ - if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) { - call->ackr_skew = skew; - call->ackr_serial = serial; - } - if (immediate) - goto cancel_timer; - return; - } - - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - - switch (ack_reason) { - case RXRPC_ACK_DELAY: - _debug("run delay timer"); - expiry = rxrpc_soft_ack_delay; - goto run_timer; - - case RXRPC_ACK_IDLE: - if (!immediate) { - _debug("run defer timer"); - expiry = rxrpc_idle_ack_delay; - goto run_timer; - } - goto cancel_timer; + _enter("{%ld,%ld,%ld:%ld}", + call->ack_at - now, call->resend_at - now, call->expire_at - now, + call->timer.expires - now); + + read_lock_bh(&call->state_lock); - case RXRPC_ACK_REQUESTED: - expiry = rxrpc_requested_ack_delay; - if (!expiry) - goto cancel_timer; - if (!immediate || serial == 1) { - _debug("run defer timer"); - goto run_timer; + if (call->state < RXRPC_CALL_COMPLETE) { + t = call->ack_at; + if (time_before(call->resend_at, t)) + t = call->resend_at; + if (time_before(call->expire_at, t)) + t = call->expire_at; + if (!timer_pending(&call->timer) || + time_before(t, call->timer.expires)) { + _debug("set timer %ld", t - now); + mod_timer(&call->timer, t); } - - default: - _debug("immediate ACK"); - goto cancel_timer; } - -run_timer: - expiry += jiffies; - if (!timer_pending(&call->ack_timer) || - time_after(call->ack_timer.expires, expiry)) - mod_timer(&call->ack_timer, expiry); - return; - -cancel_timer: - _debug("cancel timer %%%u", serial); - try_to_del_timer_sync(&call->ack_timer); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); read_unlock_bh(&call->state_lock); } /* - * propose an ACK be sent, locking the call structure + * propose an ACK be sent */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate) +static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, + bool background) { + unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - if (prior > rxrpc_ack_priority[call->ackr_reason]) { - 
spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, skew, serial, immediate); - spin_unlock_bh(&call->lock); - } -} - -/* - * set the resend timer - */ -static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, - unsigned long resend_at) -{ - read_lock_bh(&call->state_lock); - if (call->state == RXRPC_CALL_COMPLETE) - resend = 0; - - if (resend & 1) { - _debug("SET RESEND"); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); - } - - if (resend & 2) { - _debug("MODIFY RESEND TIMER"); - set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - mod_timer(&call->resend_timer, resend_at); - } else { - _debug("KILL RESEND TIMER"); - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - read_unlock_bh(&call->state_lock); -} - -/* - * resend packets - */ -static void rxrpc_resend(struct rxrpc_call *call) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - bool stop; - int loop; - u8 resend; - - _enter("{%d,%d,%d,%d},", - call->acks_hard, call->acks_unacked, - atomic_read(&call->sequence), - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - stop = false; - resend = 0; - resend_at = 0; - - for (loop = call->acks_tail; - loop != call->acks_head || stop; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - if (*p_txb & 1) - continue; - - txb = (struct sk_buff *) *p_txb; - sp = rxrpc_skb(txb); - - if (sp->need_resend) { - sp->need_resend = false; - - /* each Tx packet has a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - - whdr = (struct rxrpc_wire_header *)txb->head; - whdr->serial = htonl(sp->hdr.serial); - - _proto("Tx DATA %%%u { #%d }", - sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_data_packet(call->conn, txb) < 0) { - stop = true; - sp->resend_at = jiffies + 3; - } else { - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - sp->resend_at = - jiffies + rxrpc_resend_timeout; - } - } - - if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * handle resend timer expiry - */ -static void rxrpc_resend_timer(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 resend; - - _enter("%d,%d,%d", - call->acks_tail, call->acks_unacked, call->acks_head); - - if (call->state == RXRPC_CALL_COMPLETE) - return; - - resend = 0; - resend_at = 0; - - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - ASSERT(!(*p_txb & 1)); + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers, but we 
don't alter the timeout. + */ + _debug("prior %u %u vs %u %u", + ack_reason, prior, + call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); + if (ack_reason == call->ackr_reason) { + if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + call->ackr_serial = serial; + call->ackr_skew = skew; } + if (!immediate) + return; + } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + call->ackr_skew = skew; } - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * process soft ACKs of our transmitted packets - * - these indicate packets the peer has or has not received, but hasn't yet - * given to the consumer, and so can still be discarded and re-requested - */ -static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, - struct rxrpc_ackpacket *ack, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 sacks[RXRPC_MAXACKS], resend; - - _enter("{%d,%d},{%d},", - call->acks_hard, - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), - ack->nAcks); + switch (ack_reason) { + case RXRPC_ACK_REQUESTED: + if (rxrpc_requested_ack_delay < expiry) + expiry = rxrpc_requested_ack_delay; + if (serial == 1) + immediate = false; + break; - if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) - goto protocol_error; + case RXRPC_ACK_DELAY: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_soft_ack_delay; + break; - resend = 0; - resend_at = 0; - for (loop = 0; loop < ack->nAcks; loop++) { - p_txb = call->acks_window; - p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); + case RXRPC_ACK_IDLE: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_idle_ack_delay; + break; - switch (sacks[loop]) { - case RXRPC_ACK_TYPE_ACK: - sp->need_resend = false; - *p_txb |= 1; - break; - case RXRPC_ACK_TYPE_NACK: - sp->need_resend = true; - *p_txb &= ~1; - resend = 1; - break; - default: - _debug("Unsupported ACK type %d", sacks[loop]); - goto protocol_error; - } + default: + immediate = true; + break; } - smp_mb(); - call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); - - /* anything not explicitly ACK'd is implicitly NACK'd, but may just not - * have been received or processed yet by the far end */ - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - if (*p_txb & 1) { - /* packet must have been discarded */ - sp->need_resend = true; - *p_txb &= ~1; - resend |= 1; - } else if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + now = jiffies; + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("already scheduled"); + } else if (immediate || expiry == 0) { + _debug("immediate ACK %lx", call->events); + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) && + background) + rxrpc_queue_call(call); + } else { + ack_at = now + expiry; + _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); + if (time_before(ack_at, call->ack_at)) { + call->ack_at = ack_at; + rxrpc_set_timer(call); } } - - 
rxrpc_set_resend(call, resend, resend_at); - _leave(" = 0"); - return 0; - -protocol_error: - _leave(" = -EPROTO"); - return -EPROTO; } /* - * discard hard-ACK'd packets from the Tx window - */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) -{ - unsigned long _skb; - int tail = call->acks_tail, old_tail; - int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - - _enter("{%u,%u},%u", call->acks_hard, win, hard); - - ASSERTCMP(hard - call->acks_hard, <=, win); - - while (call->acks_hard < hard) { - smp_read_barrier_depends(); - _skb = call->acks_window[tail] & ~1; - rxrpc_free_skb((struct sk_buff *) _skb); - old_tail = tail; - tail = (tail + 1) & (call->acks_winsz - 1); - call->acks_tail = tail; - if (call->acks_unacked == old_tail) - call->acks_unacked = tail; - call->acks_hard++; - } - - wake_up(&call->waitq); -} - -/* - * clear the Tx window in the event of a failure + * propose an ACK be sent, locking the call structure */ -static void rxrpc_clear_tx_window(struct rxrpc_call *call) +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, bool background) { - rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, skew, serial, + immediate, background); + spin_unlock_bh(&call->lock); } /* - * drain the out of sequence received packet queue into the packet Rx queue + * Perform retransmission of NAK'd and unack'd packets. */ -static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +static void rxrpc_resend(struct rxrpc_call *call) { + struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; struct sk_buff *skb; - bool terminal; - int ret; + rxrpc_seq_t cursor, seq, top; + unsigned long resend_at, now; + int ix; + u8 annotation; - _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); spin_lock_bh(&call->lock); - ret = -ECONNRESET; - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - goto socket_unavailable; + cursor = call->tx_hard_ack; + top = call->tx_top; + ASSERT(before_eq(cursor, top)); + if (cursor == top) + goto out_unlock; + + /* Scan the packet list without dropping the lock and decide which of + * the packets in the Tx buffer we're going to resend and what the new + * resend timeout will be. 
+ */ + now = jiffies; + resend_at = now + rxrpc_resend_timeout; + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation == RXRPC_TX_ANNO_ACK) + continue; - skb = skb_dequeue(&call->rx_oos_queue); - if (skb) { + skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb); sp = rxrpc_skb(skb); - _debug("drain OOS packet %d [%d]", - sp->hdr.seq, call->rx_first_oos); - - if (sp->hdr.seq != call->rx_first_oos) { - skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - _debug("requeue %p {%u}", skb, call->rx_first_oos); - } else { - skb->mark = RXRPC_SKB_MARK_DATA; - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); - BUG_ON(ret < 0); - _debug("drain #%u", call->rx_data_post); - call->rx_data_post++; - - /* find out what the next packet is */ - skb = skb_peek(&call->rx_oos_queue); - rxrpc_see_skb(skb); - if (skb) - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - else - call->rx_first_oos = 0; - _debug("peek %p {%u}", skb, call->rx_first_oos); - } - } - - ret = 0; -socket_unavailable: - spin_unlock_bh(&call->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * insert an out of sequence packet into the buffer - */ -static void rxrpc_insert_oos_packet(struct rxrpc_call *call, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp, *psp; - struct sk_buff *p; - u32 seq; - - sp = rxrpc_skb(skb); - seq = sp->hdr.seq; - _enter(",,{%u}", seq); - - skb->destructor = rxrpc_packet_destructor; - ASSERTCMP(sp->call, ==, NULL); - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - - /* insert into the buffer in sequence order */ - spin_lock_bh(&call->lock); - - skb_queue_walk(&call->rx_oos_queue, p) { - psp = rxrpc_skb(p); - if (psp->hdr.seq > seq) { - _debug("insert oos #%u before #%u", seq, psp->hdr.seq); - skb_insert(p, skb, &call->rx_oos_queue); - goto inserted; - } - } - - _debug("append oos #%u", seq); - skb_queue_tail(&call->rx_oos_queue, skb); -inserted: - - /* we might now have a new front to the queue */ - if (call->rx_first_oos == 0 || seq < call->rx_first_oos) - call->rx_first_oos = seq; - - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - } - read_unlock(&call->state_lock); - - spin_unlock_bh(&call->lock); - _leave(" [stored #%u]", call->rx_first_oos); -} - -/* - * clear the Tx window on final ACK reception - */ -static void rxrpc_zap_tx_window(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - unsigned long _skb, *acks_window; - u8 winsz = call->acks_winsz; - int tail; - - acks_window = call->acks_window; - call->acks_window = NULL; - - while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { - tail = call->acks_tail; - smp_read_barrier_depends(); - _skb = acks_window[tail] & ~1; - smp_mb(); - call->acks_tail = (call->acks_tail + 1) & (winsz - 1); - - skb = (struct sk_buff *) _skb; - sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb(skb); - } - - kfree(acks_window); -} - -/* - * process the extra information that may be appended to an ACK packet - */ -static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int latest, int nAcks) -{ - struct rxrpc_ackinfo ackinfo; - struct rxrpc_peer *peer; - unsigned int mtu; - - if (skb_copy_bits(skb, nAcks + 
3, &ackinfo, sizeof(ackinfo)) < 0) { - _leave(" [no ackinfo]"); - return; - } - - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - latest, - ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), - ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); - - mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); - } -} - -/* - * process packets in the reception queue - */ -static int rxrpc_process_rx_queue(struct rxrpc_call *call, - u32 *_abort_code) -{ - struct rxrpc_ackpacket ack; - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool post_ACK; - int latest; - u32 hard, tx; - - _enter(""); - -process_further: - skb = skb_dequeue(&call->rx_queue); - if (!skb) - return -EAGAIN; - - rxrpc_see_skb(skb); - _net("deferred skb %p", skb); - - sp = rxrpc_skb(skb); - - _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); - - post_ACK = false; - - switch (sp->hdr.type) { - /* data packets that wind up here have been received out of - * order, need security processing or are jumbo packets */ - case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - /* secured packets must be verified and possibly decrypted */ - if (call->conn->security->verify_packet(call, skb, - sp->hdr.seq, - sp->hdr.cksum) < 0) - goto protocol_error; - - rxrpc_insert_oos_packet(call, skb); - goto process_further; - - /* partial ACK to process */ - case RXRPC_PACKET_TYPE_ACK: - if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { - _debug("extraction failure"); - goto protocol_error; - } - if (!skb_pull(skb, sizeof(ack))) - BUG(); - - latest = sp->hdr.serial; - hard = ntohl(ack.firstPacket); - tx = atomic_read(&call->sequence); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - latest, - ntohs(ack.maxSkew), - hard, - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); - - if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", latest); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skb->priority, sp->hdr.serial, true); - } - - /* discard any out-of-order or duplicate ACKs */ - if (latest - call->acks_latest <= 0) { - _debug("discard ACK %d <= %d", - latest, call->acks_latest); - goto discard; - } - call->acks_latest = latest; - - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && - call->state != RXRPC_CALL_SERVER_SEND_REPLY && - call->state != RXRPC_CALL_SERVER_AWAIT_ACK) - goto discard; - - _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); - - if (hard > 0) { - if (hard - 1 > tx) { - _debug("hard-ACK'd packet %d not transmitted" - " (%d top)", - hard - 1, tx); - goto protocol_error; - } - - if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || - call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) { - call->acks_hard = tx; - goto all_acked; + if (annotation == RXRPC_TX_ANNO_UNACK) { + if (time_after(sp->resend_at, now)) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + continue; } - - smp_rmb(); - rxrpc_rotate_tx_window(call, hard - 1); - } - - if (ack.nAcks > 0) { - if (hard - 1 + ack.nAcks > tx) { - _debug("soft-ACK'd packet %d+%d not" - " transmitted (%d top)", - hard - 1, ack.nAcks, tx); - goto protocol_error; - } - - 
if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) - goto protocol_error; } - goto discard; - /* complete ACK to process */ - case RXRPC_PACKET_TYPE_ACKALL: - goto all_acked; - - /* abort and busy are handled elsewhere */ - case RXRPC_PACKET_TYPE_BUSY: - case RXRPC_PACKET_TYPE_ABORT: - BUG(); - - /* connection level events - also handled elsewhere */ - case RXRPC_PACKET_TYPE_CHALLENGE: - case RXRPC_PACKET_TYPE_RESPONSE: - case RXRPC_PACKET_TYPE_DEBUG: - BUG(); - } - - /* if we've had a hard ACK that covers all the packets we've sent, then - * that ends that phase of the operation */ -all_acked: - write_lock_bh(&call->state_lock); - _debug("ack all %d", call->state); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - break; - case RXRPC_CALL_SERVER_AWAIT_ACK: - _debug("srv complete"); - __rxrpc_call_completed(call); - post_ACK = true; - break; - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_RECV_REQUEST: - goto protocol_error_unlock; /* can't occur yet */ - default: - write_unlock_bh(&call->state_lock); - goto discard; /* assume packet left over from earlier phase */ - } - - write_unlock_bh(&call->state_lock); - - /* if all the packets we sent are hard-ACK'd, then we can discard - * whatever we've got left */ - _debug("clear Tx %d", - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - - if (call->acks_window) - rxrpc_zap_tx_window(call); + /* Okay, we need to retransmit a packet. */ + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + seq++; + } while (before_eq(seq, top)); + + call->resend_at = resend_at; + + /* Now go through the Tx window and perform the retransmissions. We + * have to drop the lock for each send. If an ACK comes in whilst the + * lock is dropped, it may clear some of the retransmission markers for + * packets that it soft-ACKs. 
+ */ + seq = cursor + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + if (annotation != RXRPC_TX_ANNO_RETRANS) + continue; - if (post_ACK) { - /* post the final ACK message for userspace to pick up */ - _debug("post ACK"); - skb->mark = RXRPC_SKB_MARK_FINAL_ACK; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - spin_lock_bh(&call->lock); - if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) - BUG(); + skb = call->rxtx_buffer[ix]; + rxrpc_get_skb(skb); spin_unlock_bh(&call->lock); - goto process_further; - } - -discard: - rxrpc_free_skb(skb); - goto process_further; - -protocol_error_unlock: - write_unlock_bh(&call->state_lock); -protocol_error: - rxrpc_free_skb(skb); - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * post a message to the socket Rx queue for recvmsg() to pick up - */ -static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, - bool fatal) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - int ret; - - _enter("{%d,%lx},%u,%u,%d", - call->debug_id, call->flags, mark, error, fatal); - - /* remove timers and things for fatal messages */ - if (fatal) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } + sp = rxrpc_skb(skb); - if (mark != RXRPC_SKB_MARK_NEW_CALL && - !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - _leave("[no userid]"); - return 0; - } + /* Each Tx packet needs a new serial number */ + sp->hdr.serial = atomic_inc_return(&call->conn->serial); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - skb = alloc_skb(0, GFP_NOFS); - if (!skb) - return -ENOMEM; + whdr = (struct rxrpc_wire_header *)skb->head; + whdr->serial = htonl(sp->hdr.serial); - rxrpc_new_skb(skb); + if (rxrpc_send_data_packet(call->conn, skb) < 0) { + call->resend_at = now + 2; + rxrpc_free_skb(skb); + return; + } - skb->mark = mark; - - sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); - sp->error = error; - sp->call = call; - rxrpc_get_call_for_skb(call, skb); + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = now + rxrpc_resend_timeout; + rxrpc_free_skb(skb); spin_lock_bh(&call->lock); - ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); - spin_unlock_bh(&call->lock); - BUG_ON(ret < 0); - } - return 0; + /* We need to clear the retransmit state, but there are two + * things we need to be aware of: A new ACK/NAK might have been + * received and the packet might have been hard-ACK'd (in which + * case it will no longer be in the buffer). + */ + if (after(seq, call->tx_hard_ack) && + (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS || + call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)) + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + + if (after(call->tx_hard_ack, seq)) + seq = call->tx_hard_ack; + seq++; + } while (before_eq(seq, top)); + +out_unlock: + spin_unlock_bh(&call->lock); + _leave(""); } /* - * Handle background processing of incoming call packets and ACK / abort - * generation. A ref on the call is donated to us by whoever queued the work - * item. + * Handle retransmission and deferred ACK/abort generation. 
*/ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo ackinfo; - struct msghdr msg; - struct kvec iov[5]; - enum rxrpc_call_event genbit; - unsigned long bits; - __be32 data, pad; - size_t len; - bool requeue = false; - int loop, nbit, ioc, ret, mtu; - u32 serial, abort_code = RX_PROTOCOL_ERROR; - u8 *acks = NULL; + unsigned long now; rxrpc_see_call(call); //printk("\n--------------------\n"); - _enter("{%d,%s,%lx} [%lu]", - call->debug_id, rxrpc_call_states[call->state], call->events, - (jiffies - call->creation_jif) / (HZ / 10)); - - if (call->state >= RXRPC_CALL_COMPLETE) { - rxrpc_put_call(call, rxrpc_call_put); - return; - } - - if (!call->conn) - goto skip_msg_init; - - /* there's a good chance we're going to have to send a message, so set - * one up in advance */ - msg.msg_name = &call->peer->srx.transport; - msg.msg_namelen = call->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + _enter("{%d,%s,%lx}", + call->debug_id, rxrpc_call_states[call->state], call->events); - whdr.epoch = htonl(call->conn->proto.epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ACK; - whdr.flags = call->conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = call->conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(call->service_id); - - memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); -skip_msg_init: - - /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { - enum rxrpc_skb_mark mark; - - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - if (call->completion == RXRPC_CALL_NETWORK_ERROR) { - mark = RXRPC_SKB_MARK_NET_ERROR; - _debug("post net error %d", call->error); - } else { - mark = RXRPC_SKB_MARK_LOCAL_ERROR; - _debug("post net local error %d", call->error); - } - - if (rxrpc_post_message(call, mark, call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - _debug("post conn abort"); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { - whdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_EV_REJECT_BUSY; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->error, true) < 0) - goto no_mem; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->abort_code); - iov[1].iov_base = &data; - iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_EV_ABORT; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_EV_ACK_FINAL; - - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - ack.serial = 0; - ack.reason = RXRPC_ACK_IDLE; - ack.nAcks = 0; - 
call->ackr_reason = 0; - - spin_lock_bh(&call->lock); - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = &pad; - iov[2].iov_len = 3; - iov[3].iov_base = &ackinfo; - iov[3].iov_len = sizeof(ackinfo); - goto send_ACK; +recheck_state: + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + goto recheck_state; } - if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | - (1 << RXRPC_CALL_EV_RCVD_ABORT)) - ) { - u32 mark; - - if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) - mark = RXRPC_SKB_MARK_REMOTE_ABORT; - else - mark = RXRPC_SKB_MARK_BUSY; - - _debug("post abort/busy"); - rxrpc_clear_tx_window(call); - if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; + if (call->state == RXRPC_CALL_COMPLETE) { + del_timer_sync(&call->timer); + goto out_put; } - if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { - _debug("do implicit ackall"); - rxrpc_clear_tx_window(call); - } - - if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { + now = jiffies; + if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); - - _debug("post timeout"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ETIME, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - goto kill_ACKs; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); } - /* deal with assorted inbound messages */ - if (!skb_queue_empty(&call->rx_queue)) { - ret = rxrpc_process_rx_queue(call, &abort_code); - switch (ret) { - case 0: - case -EAGAIN: - break; - case -ENOMEM: - goto no_mem; - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EPROTO: - rxrpc_abort_call("PRO", call, 0, abort_code, -ret); - goto kill_ACKs; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || + time_after_eq(now, call->ack_at)) { + call->ack_at = call->expire_at; + if (call->ackr_reason) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto recheck_state; } } - /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || + time_after_eq(now, call->resend_at)) { rxrpc_resend(call); - - /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("send ACK: window: %d - %d { %lx }", - call->rx_data_eaten, call->ackr_win_top, - call->ackr_window[0]); - - if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && - call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { - /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - goto maybe_reschedule; - } - - genbit = RXRPC_CALL_EV_ACK; - - acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, - GFP_NOFS); - if (!acks) - goto no_mem; - - //hdr.flags = RXRPC_SLOW_START_OK; - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - - spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - - ack.nAcks 
= 0; - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - nbit = loop * BITS_PER_LONG; - for (bits = call->ackr_window[loop]; bits; bits >>= 1 - ) { - _debug("- l=%d n=%d b=%lx", loop, nbit, bits); - if (bits & 1) { - acks[nbit] = RXRPC_ACK_TYPE_ACK; - ack.nAcks = nbit + 1; - } - nbit++; - } - } - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = acks; - iov[2].iov_len = ack.nAcks; - iov[3].iov_base = &pad; - iov[3].iov_len = 3; - iov[4].iov_base = &ackinfo; - iov[4].iov_len = sizeof(ackinfo); - - switch (ack.reason) { - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - case RXRPC_ACK_NOSPACE: - case RXRPC_ACK_PING: - case RXRPC_ACK_PING_RESPONSE: - goto send_ACK_with_skew; - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - goto send_ACK; - } + goto recheck_state; } - /* handle completion of security negotiations on an incoming - * connection */ - if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { - _debug("secured"); - spin_lock_bh(&call->lock); - - if (call->state == RXRPC_CALL_SERVER_SECURING) { - struct rxrpc_sock *rx; - _debug("securing"); - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (rx) { - write_lock(&rx->call_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &rx->acceptq); - } - write_unlock(&rx->call_lock); - } - rcu_read_unlock(); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - read_unlock(&call->state_lock); - } - - spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) - goto maybe_reschedule; - } - - /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { - _debug("post accept"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, - 0, false) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - goto maybe_reschedule; - } - - /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { - _debug("accepted"); - ASSERTCMP(call->rx_data_post, ==, 0); - call->rx_data_post = 1; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - read_unlock_bh(&call->state_lock); - } - - /* drain the out of sequence received packet queue into the packet Rx - * queue */ - if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { - while (call->rx_data_post == call->rx_first_oos) - if (rxrpc_drain_rx_oos_queue(call) < 0) - break; - goto maybe_reschedule; - } + rxrpc_set_timer(call); /* other events may have been raised since we started checking */ - goto maybe_reschedule; - -send_ACK_with_skew: - ack.maxSkew = htons(call->ackr_skew); -send_ACK: - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rxrpc_rx_window_size); - - /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - 
ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - del_timer_sync(&call->ack_timer); - if (ack.nAcks > 0) - set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); - goto send_message_2; - -send_message: - _debug("send message"); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); -send_message_2: - - len = iov[0].iov_len; - ioc = 1; - if (iov[4].iov_len) { - ioc = 5; - len += iov[4].iov_len; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[3].iov_len) { - ioc = 4; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[2].iov_len) { - ioc = 3; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[1].iov_len) { - ioc = 2; - len += iov[1].iov_len; - } - - ret = kernel_sendmsg(call->conn->params.local->socket, - &msg, iov, ioc, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - goto error; - } - - switch (genbit) { - case RXRPC_CALL_EV_ABORT: - clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - - case RXRPC_CALL_EV_ACK_FINAL: - rxrpc_call_completed(call); - goto kill_ACKs; - - default: - clear_bit(genbit, &call->events); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - _debug("start ACK timer"); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_skew, call->ackr_serial, - false); - default: - break; - } - goto maybe_reschedule; - } - -kill_ACKs: - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - -maybe_reschedule: - if (call->events || !skb_queue_empty(&call->rx_queue)) { - if (call->state < RXRPC_CALL_COMPLETE) - requeue = true; - } - -error: - kfree(acks); - - if ((requeue || call->events) && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->proto.cid); + if (call->events && call->state < RXRPC_CALL_COMPLETE) { __rxrpc_queue_call(call); - } else { - rxrpc_put_call(call, rxrpc_call_put); + goto out; } +out_put: + rxrpc_put_call(call, rxrpc_call_put); +out: _leave(""); - return; - -no_mem: - _debug("out of memory"); - goto maybe_reschedule; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d233adc9b5e5..18ab13f82f6e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -30,7 +30,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", - [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", @@ -43,7 +42,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", - [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", @@ -57,10 +55,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_queued_ref] = "QUR", [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", - [rxrpc_call_got_skb] = "Gsk", [rxrpc_call_got_userid] = "Gus", 
[rxrpc_call_put] = "PUT", - [rxrpc_call_put_skb] = "Psk", [rxrpc_call_put_userid] = "Pus", [rxrpc_call_put_noqueue] = "PNQ", }; @@ -69,9 +65,15 @@ struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_ack_time_expired(unsigned long _call); -static void rxrpc_resend_time_expired(unsigned long _call); +static void rxrpc_call_timer_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *)_call; + + _enter("%d", call->debug_id); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_queue_call(call); +} /* * find an extant server call @@ -121,27 +123,24 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call) return NULL; - call->acks_winsz = 16; - call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE, + sizeof(struct sk_buff *), gfp); - if (!call->acks_window) { - kmem_cache_free(rxrpc_call_jar, call); - return NULL; - } + if (!call->rxtx_buffer) + goto nomem; - setup_timer(&call->lifetimer, &rxrpc_call_life_expired, - (unsigned long) call); - setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, - (unsigned long) call); - setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, - (unsigned long) call); + call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp); + if (!call->rxtx_annotations) + goto nomem_2; + + setup_timer(&call->timer, rxrpc_call_timer_expired, + (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); - skb_queue_head_init(&call->rx_queue); - skb_queue_head_init(&call->rx_oos_queue); - skb_queue_head_init(&call->knlrecv_queue); + INIT_LIST_HEAD(&call->recvmsg_link); + INIT_LIST_HEAD(&call->sock_link); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); @@ -150,63 +149,52 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - call->rx_data_expect = 1; - call->rx_data_eaten = 0; - call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; - call->creation_jif = jiffies; + /* Leave space in the ring to handle a maxed-out jumbo packet */ + call->rx_winsize = RXRPC_RXTX_BUFF_SIZE - 1 - 46; + call->tx_winsize = 16; + call->rx_expect_next = 1; return call; + +nomem_2: + kfree(call->rxtx_buffer); +nomem: + kmem_cache_free(rxrpc_call_jar, call); + return NULL; } /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, - struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; _enter(""); - ASSERT(rx->local != NULL); - call = rxrpc_alloc_call(gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - call->rx_data_post = 1; call->service_id = srx->srx_service; - rcu_assign_pointer(call->socket, rx); _leave(" = %p", call); return call; } /* - * Begin client call. + * Initiate the call ack/resend/expiry timer. */ -static int rxrpc_begin_client_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static void rxrpc_start_call_timer(struct rxrpc_call *call) { - int ret; - - /* Set up or get a connection record and set the protocol parameters, - * including channel number and call ID. 
- */ - ret = rxrpc_connect_call(call, cp, srx, gfp); - if (ret < 0) - return ret; - - spin_lock(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); - spin_unlock(&call->conn->params.peer->lock); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - return 0; + unsigned long expire_at; + + expire_at = jiffies + rxrpc_max_call_lifetime; + call->expire_at = expire_at; + call->ack_at = expire_at; + call->resend_at = expire_at; + call->timer.expires = expire_at; + add_timer(&call->timer); } /* @@ -226,7 +214,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; @@ -255,19 +243,32 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, goto found_user_ID_now_present; } + rcu_assign_pointer(call->socket, rx); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); - ret = rxrpc_begin_client_call(call, cp, srx, gfp); + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); if (ret < 0) goto error; + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); + + rxrpc_start_call_timer(call); + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); _leave(" = %p [new]", call); @@ -279,9 +280,9 @@ error: write_unlock(&rx->call_lock); rxrpc_put_call(call, rxrpc_call_put_userid); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); error_out: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, @@ -303,142 +304,46 @@ found_user_ID_now_present: } /* - * set up an incoming call - * - called in process context with IRQs enabled + * Set up an incoming call. call->conn points to the connection. + * This is called in BH context and isn't allowed to fail. 
*/ -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct sk_buff *skb) +void rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct sk_buff *skb) { + struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call, *candidate; - const void *here = __builtin_return_address(0); - u32 call_id, chan; - - _enter(",%d", conn->debug_id); - - ASSERT(rx != NULL); - - candidate = rxrpc_alloc_call(GFP_NOIO); - if (!candidate) - return ERR_PTR(-EBUSY); + u32 chan; - trace_rxrpc_call(candidate, rxrpc_call_new_service, - atomic_read(&candidate->usage), here, NULL); + _enter(",%d", call->conn->debug_id); - chan = sp->hdr.cid & RXRPC_CHANNELMASK; - candidate->conn = conn; - candidate->peer = conn->params.peer; - candidate->cid = sp->hdr.cid; - candidate->call_id = sp->hdr.callNumber; - candidate->security_ix = sp->hdr.securityIndex; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; - candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE); - if (conn->security_ix > 0) - candidate->state = RXRPC_CALL_SERVER_SECURING; - rcu_assign_pointer(candidate->socket, rx); - - spin_lock(&conn->channel_lock); - - /* set the channel for this call */ - call = rcu_dereference_protected(conn->channels[chan].call, - lockdep_is_held(&conn->channel_lock)); - - _debug("channel[%u] is %p", candidate->cid & RXRPC_CHANNELMASK, call); - if (call && call->call_id == sp->hdr.callNumber) { - /* already set; must've been a duplicate packet */ - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - rxrpc_queue_call(call); - case RXRPC_CALL_REMOTELY_ABORTED: - read_unlock(&call->state_lock); - goto aborted_call; - default: - rxrpc_get_call(call, rxrpc_call_got); - read_unlock(&call->state_lock); - goto extant_call; - } - } - - if (call) { - /* it seems the channel is still in use from the previous call - * - ditch the old binding if its call is now complete */ - _debug("CALL: %u { %s }", - call->debug_id, rxrpc_call_states[call->state]); - - if (call->state == RXRPC_CALL_COMPLETE) { - __rxrpc_disconnect_call(conn, call); - } else { - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -EBUSY"); - return ERR_PTR(-EBUSY); - } - } - - /* check the call number isn't duplicate */ - _debug("check dup"); - call_id = sp->hdr.callNumber; - - /* We just ignore calls prior to the current call ID. Terminated calls - * are handled via the connection. + rcu_assign_pointer(call->socket, rx); + call->call_id = sp->hdr.callNumber; + call->service_id = sp->hdr.serviceId; + call->cid = sp->hdr.cid; + call->state = RXRPC_CALL_SERVER_ACCEPTING; + if (sp->hdr.securityIndex > 0) + call->state = RXRPC_CALL_SERVER_SECURING; + + /* Set the channel for this call. We don't get channel_lock as we're + * only defending against the data_ready handler (which we're called + * from) and the RESPONSE packet parser (which is only really + * interested in call_counter and can cope with a disagreement with the + * call pointer). 
*/ - if (call_id <= conn->channels[chan].call_counter) - goto old_call; /* TODO: Just drop packet */ - - /* Temporary: Mirror the backlog prealloc ref (TODO: use prealloc) */ - rxrpc_get_call(candidate, rxrpc_call_got); - - /* make the call available */ - _debug("new call"); - call = candidate; - candidate = NULL; - conn->channels[chan].call_counter = call_id; + chan = sp->hdr.cid & RXRPC_CHANNELMASK; + conn->channels[chan].call_counter = call->call_id; + conn->channels[chan].call_id = call->call_id; rcu_assign_pointer(conn->channels[chan].call, call); - rxrpc_get_connection(conn); - rxrpc_get_peer(call->peer); - spin_unlock(&conn->channel_lock); spin_lock(&conn->params.peer->lock); hlist_add_head(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); - write_lock_bh(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); - - call->service_id = conn->params.service_id; - _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - _leave(" = %p {%d} [new]", call, call->debug_id); - return call; - -extant_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); - return call; - -aborted_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNABORTED"); - return ERR_PTR(-ECONNABORTED); - -old_call: - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNRESET [old]"); - return ERR_PTR(-ECONNRESET); + rxrpc_start_call_timer(call); + _leave(""); } /* @@ -497,25 +402,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Note the addition of a ref on a call for a socket buffer. + * Detach a call from its owning socket. 
*/ -void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) +void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { - const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&call->usage); + struct rxrpc_connection *conn = call->conn; + bool put = false; + int i; - trace_rxrpc_call(call, rxrpc_call_got_skb, n, here, skb); -} + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); -/* - * detach a call from a socket and set up for release - */ -void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - _enter("{%d,%d,%d,%d}", - call->debug_id, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - call->rx_first_oos); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); rxrpc_see_call(call); @@ -524,80 +421,46 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) BUG(); spin_unlock_bh(&call->lock); - /* dissociate from the socket - * - the socket's ref on the call is passed to the death timer - */ - _debug("RELEASE CALL %p (%d)", call, call->debug_id); + del_timer_sync(&call->timer); - if (call->peer) { - spin_lock(&call->peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); - } + /* Make sure we don't get any more notifications */ + write_lock_bh(&rx->recvmsg_lock); - write_lock_bh(&rx->call_lock); - if (!list_empty(&call->accept_link)) { + if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); - ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + list_del(&call->recvmsg_link); + put = true; + } + + /* list_empty() must return false in rxrpc_notify_socket() */ + call->recvmsg_link.next = NULL; + call->recvmsg_link.prev = NULL; + + write_unlock_bh(&rx->recvmsg_lock); + if (put) + rxrpc_put_call(call, rxrpc_call_put); + + write_lock(&rx->call_lock); + + if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_put_call(call, rxrpc_call_put_userid); } - write_unlock_bh(&rx->call_lock); - - /* free up the channel for reuse */ - if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) { - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); - rxrpc_call_completed(call); - } else { - write_lock_bh(&call->state_lock); - - if (call->state < RXRPC_CALL_COMPLETE) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - __rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); - clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - } - - write_unlock_bh(&call->state_lock); - } - if (call->conn) + list_del(&call->sock_link); + write_unlock(&rx->call_lock); + + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + if (conn) rxrpc_disconnect_call(call); - /* clean up the Rx queue */ - if (!skb_queue_empty(&call->rx_queue) || - !skb_queue_empty(&call->rx_oos_queue)) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - - _debug("purge Rx queues"); - - spin_lock_bh(&call->lock); - while ((skb = skb_dequeue(&call->rx_queue)) || - (skb = skb_dequeue(&call->rx_oos_queue))) { - spin_unlock_bh(&call->lock); - - sp = rxrpc_skb(skb); - _debug("- zap %s %%%u #%u", - 
rxrpc_pkts[sp->hdr.type], - sp->hdr.serial, sp->hdr.seq); - rxrpc_free_skb(skb); - spin_lock_bh(&call->lock); - } - spin_unlock_bh(&call->lock); + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + rxrpc_free_skb(call->rxtx_buffer[i]); + call->rxtx_buffer[i] = NULL; } - rxrpc_purge_queue(&call->knlrecv_queue); - - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->lifetimer); /* We have to release the prealloc backlog ref */ if (rxrpc_is_service_call(call)) @@ -611,28 +474,19 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; - struct rb_node *p; _enter("%p", rx); - read_lock_bh(&rx->call_lock); - - /* kill the not-yet-accepted incoming calls */ - list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_release_call(rx, call); - } - - list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_release_call(rx, call); - } - - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); + while (!list_empty(&rx->sock_calls)) { + call = list_entry(rx->sock_calls.next, + struct rxrpc_call, sock_link); + rxrpc_get_call(call, rxrpc_call_got); + rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); } - read_unlock_bh(&rx->call_lock); _leave(""); } @@ -651,23 +505,12 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); - rxrpc_cleanup_call(call); - } -} + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); -/* - * Release a call ref held by a socket buffer. 
- */ -void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb) -{ - const void *here = __builtin_return_address(0); - int n; + write_lock(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock(&rxrpc_call_lock); - n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, rxrpc_call_put_skb, n, here, skb); - ASSERTCMP(n, >=, 0); - if (n == 0) { - _debug("call %d dead", call->debug_id); rxrpc_cleanup_call(call); } } @@ -679,9 +522,9 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); - rxrpc_purge_queue(&call->rx_queue); - rxrpc_purge_queue(&call->knlrecv_queue); rxrpc_put_peer(call->peer); + kfree(call->rxtx_buffer); + kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); } @@ -690,49 +533,24 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) */ void rxrpc_cleanup_call(struct rxrpc_call *call) { - _net("DESTROY CALL %d", call->debug_id); + int i; - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->lifetimer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->resend_timer); + del_timer_sync(&call->timer); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERT(!work_pending(&call->processor)); ASSERTCMP(call->conn, ==, NULL); - if (call->acks_window) { - _debug("kill Tx window %d", - CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz)); - smp_mb(); - while (CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz) > 0) { - struct rxrpc_skb_priv *sp; - unsigned long _skb; - - _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *)_skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb((struct sk_buff *)_skb); - call->acks_tail = - (call->acks_tail + 1) & (call->acks_winsz - 1); - } - - kfree(call->acks_window); - } + /* Clean up the Rx/Tx buffer */ + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) + rxrpc_free_skb(call->rxtx_buffer[i]); rxrpc_free_skb(call->tx_pending); - rxrpc_purge_queue(&call->rx_queue); - ASSERT(skb_queue_empty(&call->rx_oos_queue)); - rxrpc_purge_queue(&call->knlrecv_queue); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } @@ -747,8 +565,8 @@ void __exit rxrpc_destroy_all_calls(void) if (list_empty(&rxrpc_calls)) return; - - write_lock_bh(&rxrpc_call_lock); + + write_lock(&rxrpc_call_lock); while (!list_empty(&rxrpc_calls)) { call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); @@ -757,74 +575,15 @@ void __exit rxrpc_destroy_all_calls(void) rxrpc_see_call(call); list_del_init(&call->link); - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), rxrpc_call_states[call->state], call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); cond_resched(); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); } - write_unlock_bh(&rxrpc_call_lock); - _leave(""); -} - -/* - * handle call lifetime being exceeded - */ -static void rxrpc_call_life_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - 
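A note on the deletions around this point: the separate lifetime, ACK and resend timers being removed here are subsumed by the single call->timer installed in rxrpc_alloc_call() above, which is armed for whichever deadline falls due first; its expiry handler just queues the call so the event loop can work out what is actually pending. This aside is not part of the patch -- it is a minimal sketch of that earliest-deadline idea in plain C, using hypothetical names rather than the patch's own helpers:

	struct call_deadlines {
		unsigned long expire_at;	/* hard lifetime limit of the call */
		unsigned long ack_at;		/* when a deferred ACK should be sent */
		unsigned long resend_at;	/* when unacked Tx packets get retransmitted */
	};

	/* Pick the soonest of the three deadlines; the one timer is armed for
	 * this value and the expiry handler decides which event is due.  A real
	 * kernel implementation would compare with time_before() so that
	 * jiffies wraparound is handled correctly. */
	static unsigned long next_timer_deadline(const struct call_deadlines *d)
	{
		unsigned long t = d->expire_at;

		if (d->ack_at < t)
			t = d->ack_at;
		if (d->resend_at < t)
			t = d->resend_at;
		return t;
	}

One practical consequence is visible in the rxrpc_release_call() and rxrpc_cleanup_call() hunks nearby: teardown now needs a single del_timer_sync(&call->timer) instead of three separate del_timer_sync() calls.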
_enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - rxrpc_queue_call(call); -} - -/* - * handle resend timer expiry - * - may not take call->state_lock as this can deadlock against del_timer_sync() - */ -static void rxrpc_resend_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); -} - -/* - * handle ACK timer expiry - */ -static void rxrpc_ack_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - rxrpc_see_call(call); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); + write_unlock(&rxrpc_call_lock); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8c7938ba6a84..0691007cfc02 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -15,10 +15,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -140,16 +136,10 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, u32 abort_code, int error) { struct rxrpc_call *call; - bool queue; - int i, bit; + int i; _enter("{%d},%x", conn->debug_id, abort_code); - if (compl == RXRPC_CALL_LOCALLY_ABORTED) - bit = RXRPC_CALL_EV_CONN_ABORT; - else - bit = RXRPC_CALL_EV_RCVD_ABORT; - spin_lock(&conn->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { @@ -157,22 +147,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, conn->channels[i].call, lockdep_is_held(&conn->channel_lock)); if (call) { - rxrpc_see_call(call); if (compl == RXRPC_CALL_LOCALLY_ABORTED) trace_rxrpc_abort("CON", call->cid, call->call_id, 0, abort_code, error); - - write_lock_bh(&call->state_lock); - if (rxrpc_set_call_completion(call, compl, abort_code, - error)) { - set_bit(bit, &call->events); - queue = true; - } - write_unlock_bh(&call->state_lock); - if (queue) - rxrpc_queue_call(call); - + if (rxrpc_set_call_completion(call, compl, + abort_code, error)) + rxrpc_notify_socket(call); } } @@ -251,17 +232,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, /* * mark a call as being on a now-secured channel - * - must be called with softirqs disabled + * - must be called with BH's disabled. 
*/ static void rxrpc_call_is_secure(struct rxrpc_call *call) { _enter("%p", call); if (call) { - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_SERVER_SECURING) { + call->state = RXRPC_CALL_SERVER_ACCEPTING; + rxrpc_notify_socket(call); + } + write_unlock_bh(&call->state_lock); } } @@ -278,7 +260,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - kleave(" = -ECONNABORTED [%u]", conn->state); + _leave(" = -ECONNABORTED [%u]", conn->state); return -ECONNABORTED; } @@ -291,14 +273,14 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0) return -EPROTO; abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, 0, RXRPC_CALL_REMOTELY_ABORTED, - abort_code); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -323,14 +305,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; + spin_unlock(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->channel_lock))); + } else { + spin_unlock(&conn->state_lock); } - spin_unlock(&conn->state_lock); spin_unlock(&conn->channel_lock); return 0; @@ -433,88 +417,3 @@ protocol_error: _leave(" [EPROTO]"); goto out; } - -/* - * put a packet up for transport-level abort - */ -void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - CHECK_SLAB_OKAY(&local->usage); - - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); -} - -/* - * reject packets through the local endpoint - */ -void rxrpc_reject_packets(struct rxrpc_local *local) -{ - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; - struct rxrpc_skb_priv *sp; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - struct msghdr msg; - struct kvec iov[2]; - size_t size; - __be32 code; - - _enter("%d", local->debug_id); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &code; - iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); - - msg.msg_name = &sa; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - - memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; - - while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - code = htonl(skb->priority); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = 
sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; - } - - rxrpc_free_skb(skb); - } - - _leave(""); -} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8da82e3aa00e..ffa9addb97b2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -169,7 +169,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, chan->last_abort = call->abort_code; chan->last_type = RXRPC_PACKET_TYPE_ABORT; } else { - chan->last_seq = call->rx_data_eaten; + chan->last_seq = call->rx_hard_ack; chan->last_type = RXRPC_PACKET_TYPE_ACK; } /* Sync with rxrpc_conn_retransmit(). */ @@ -191,6 +191,10 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + spin_lock_bh(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock_bh(&conn->params.peer->lock); + if (rxrpc_is_client_call(call)) return rxrpc_disconnect_client_call(call); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 189338a60457..83d54da4ce8b 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -65,9 +65,8 @@ done: * Insert a service connection into a peer's tree, thereby making it a target * for incoming packets. */ -static struct rxrpc_connection * -rxrpc_publish_service_conn(struct rxrpc_peer *peer, - struct rxrpc_connection *conn) +static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) { struct rxrpc_connection *cursor = NULL; struct rxrpc_conn_proto k = conn->proto; @@ -96,7 +95,7 @@ conn_published: set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); write_sequnlock_bh(&peer->service_conn_lock); _leave(" = %d [new]", conn->debug_id); - return conn; + return; found_extant_conn: if (atomic_read(&cursor->usage) == 0) @@ -143,106 +142,30 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) } /* - * get a record of an incoming connection + * Set up an incoming connection. This is called in BH context with the RCU + * read lock held. */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - const char *new = "old"; _enter(""); - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) { - _debug("no peer"); - return ERR_PTR(-EBUSY); - } - - ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - - rcu_read_lock(); - peer = rxrpc_lookup_peer_rcu(local, srx); - if (peer) { - conn = rxrpc_find_service_conn_rcu(peer, skb); - if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto security_mismatch_rcu; - if (rxrpc_get_connection_maybe(conn)) - goto found_extant_connection_rcu; - - /* The conn has expired but we can't remove it without - * the appropriate lock, so we attempt to replace it - * when we have a new candidate. - */ - } - - if (!rxrpc_get_peer_maybe(peer)) - peer = NULL; - } - rcu_read_unlock(); - - if (!peer) { - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) - goto enomem; - } - - /* We don't have a matching record yet. 
*/ - conn = rxrpc_alloc_connection(GFP_NOIO); - if (!conn) - goto enomem_peer; - conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - conn->params.local = local; - conn->params.peer = peer; conn->params.service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; - conn->state = RXRPC_CONN_SERVICE; - if (conn->params.service_id) + if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; - - rxrpc_get_local(local); - - /* We maintain an extra ref on the connection whilst it is on - * the rxrpc_connections list. - */ - atomic_set(&conn->usage, 2); - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); - write_unlock(&rxrpc_connection_lock); + else + conn->state = RXRPC_CONN_SERVICE; /* Make the connection a target for incoming packets. */ - rxrpc_publish_service_conn(peer, conn); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - -found_extant_connection_rcu: - rcu_read_unlock(); - goto success; - -security_mismatch_rcu: - rcu_read_unlock(); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); + rxrpc_publish_service_conn(conn->params.peer, conn); -enomem_peer: - rxrpc_put_peer(peer); -enomem: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5906579060cd..afeba98004b1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1,6 +1,6 @@ /* RxRPC packet reception * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -27,549 +27,547 @@ #include #include "ar-internal.h" +static void rxrpc_proto_abort(const char *why, + struct rxrpc_call *call, rxrpc_seq_t seq) +{ + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) { + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +} + /* - * queue a packet for recvmsg to pass to userspace - * - the caller must hold a lock on call->lock - * - must not be called with interrupts disabled (sk_filter() disables BH's) - * - eats the packet whether successful or not - * - there must be just one reference to the packet, which the caller passes to - * this function + * Apply a hard ACK by advancing the Tx window. 
*/ -int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, - bool force, bool terminal) +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) { - struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx; - struct sock *sk; - int ret; + struct sk_buff *skb, *list = NULL; + int ix; - _enter(",,%d,%d", force, terminal); + spin_lock(&call->lock); - ASSERT(!irqs_disabled()); + while (before(call->tx_hard_ack, to)) { + call->tx_hard_ack++; + ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + skb->next = list; + list = skb; + } - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, call); + spin_unlock(&call->lock); - /* if we've already posted the terminal message for a call, then we - * don't post any more */ - if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - _debug("already terminated"); - ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + while (list) { + skb = list; + list = skb->next; + skb->next = NULL; rxrpc_free_skb(skb); - return 0; } +} - /* The socket may go away under us */ - ret = 0; - rcu_read_lock(); - rx = rcu_dereference(call->socket); - if (!rx) - goto out; - sk = &rx->sk; - if (sock_flag(sk, SOCK_DEAD)) - goto out; +/* + * End the transmission phase of a call. + * + * This occurs when we get an ACKALL packet, the first DATA packet of a reply, + * or a final ACK packet. + */ +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) +{ + _enter(""); - if (!force) { - /* cast skb->rcvbuf to unsigned... It's pointless, but - * reduces number of warnings when compiling with -W - * --ANK */ -// ret = -ENOBUFS; -// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -// (unsigned int) sk->sk_rcvbuf) -// goto out; - - ret = sk_filter(sk, skb); - if (ret < 0) - goto out; + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + return true; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; + default: + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; } - spin_lock_bh(&sk->sk_receive_queue.lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && - !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - sk->sk_state != RXRPC_CLOSE) { - skb->destructor = rxrpc_packet_destructor; - skb->dev = NULL; - skb->sk = sk; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - - if (terminal) { - _debug("<<<< TERMINAL MESSAGE >>>>"); - set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); - } + rxrpc_rotate_tx_window(call, call->tx_top); - /* allow interception by a kernel service */ - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL && - rx->notify_new_call) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - rx->notify_new_call(&rx->sk, NULL, 0); - } else if (call->notify_rx) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - skb_queue_tail(&call->knlrecv_queue, skb); - call->notify_rx(&rx->sk, call, call->user_call_ID); - } else { - _net("post skb %p", skb); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); + write_lock(&call->state_lock); - sk->sk_data_ready(sk); - } - skb = NULL; - } else { - spin_unlock_bh(&sk->sk_receive_queue.lock); + switch (call->state) { + default: + break; + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + __rxrpc_call_completed(call); + rxrpc_notify_socket(call); + break; } - ret = 0; -out: - 
rxrpc_free_skb(skb); - rcu_read_unlock(); + write_unlock(&call->state_lock); + _leave(" = ok"); + return true; +} + +/* + * Scan a jumbo packet to validate its structure and to work out how many + * subpackets it contains. + * + * A jumbo packet is a collection of consecutive packets glued together with + * little headers between that indicate how to change the initial header for + * each subpacket. + * + * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but + * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any + * size. + */ +static bool rxrpc_validate_jumbo(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int len = skb->data_len; + int nr_jumbo = 1; + u8 flags = sp->hdr.flags; + + do { + nr_jumbo++; + if (len - offset < RXRPC_JUMBO_SUBPKTLEN) + goto protocol_error; + if (flags & RXRPC_LAST_PACKET) + goto protocol_error; + offset += RXRPC_JUMBO_DATALEN; + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + goto protocol_error; + offset += sizeof(struct rxrpc_jumbo_header); + } while (flags & RXRPC_JUMBO_PACKET); + + sp->nr_jumbo = nr_jumbo; + return true; - _leave(" = %d", ret); - return ret; +protocol_error: + return false; } /* - * process a DATA packet, posting the packet to the appropriate queue - * - eats the packet if successful + * Handle reception of a duplicate packet. + * + * We have to take care to avoid an attack here whereby we're given a series of + * jumbograms, each with a sequence number one before the preceding one and + * filled up to maximum UDP size. If they never send us the first packet in + * the sequence, they can cause us to have to hold on to around 2MiB of kernel + * space until the call times out. + * + * We limit the space usage by only accepting three duplicate jumbo packets per + * call. After that, we tell the other side we're no longer accepting jumbos + * (that information is encoded in the ACK packet). */ -static int rxrpc_fast_process_data(struct rxrpc_call *call, - struct sk_buff *skb, u32 seq) +static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, + u8 annotation, bool *_jumbo_dup) { - struct rxrpc_skb_priv *sp; - bool terminal; - int ret, ackbit, ack; - u32 serial; - u16 skew; - u8 flags; + /* Discard normal packets that are duplicates. */ + if (annotation == 0) + return; - _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + /* Skip jumbo subpackets that are duplicates. When we've had three or + * more partially duplicate jumbo packets, we refuse to take any more + * jumbos for this call. + */ + if (!*_jumbo_dup) { + call->nr_jumbo_dup++; + *_jumbo_dup = true; + } +} - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, NULL); - flags = sp->hdr.flags; - serial = sp->hdr.serial; - skew = skb->priority; +/* + * Process a DATA packet, adding the packet to the Rx ring. 
+ */ +static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sp->offset; + unsigned int ix; + rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; + rxrpc_seq_t seq = sp->hdr.seq, hard_ack; + bool immediate_ack = false, jumbo_dup = false, queued; + u16 len; + u8 ack = 0, flags, annotation = 0; - spin_lock(&call->lock); + _enter("{%u,%u},{%u,%u}", + call->rx_hard_ack, call->rx_top, skb->data_len, seq); - if (call->state > RXRPC_CALL_COMPLETE) - goto discard; + _proto("Rx DATA %%%u { #%u f=%02x }", + sp->hdr.serial, seq, sp->hdr.flags); - ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); - ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); - ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + if (call->state >= RXRPC_CALL_COMPLETE) + return; - if (seq < call->rx_data_post) { - _debug("dup #%u [-%u]", seq, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - ret = -ENOBUFS; - goto discard_and_ack; - } + /* Received data implicitly ACKs all of the request packets we sent + * when we're acting as a client. + */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY && + !rxrpc_end_tx_phase(call, "ETD")) + return; - /* we may already have the packet in the out of sequence queue */ - ackbit = seq - (call->rx_data_eaten + 1); - ASSERTCMP(ackbit, >=, 0); - if (__test_and_set_bit(ackbit, call->ackr_window)) { - _debug("dup oos #%u [%u,%u]", - seq, call->rx_data_eaten, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - goto discard_and_ack; - } + call->ackr_prev_seq = seq; - if (seq >= call->ackr_win_top) { - _debug("exceed #%u [%u]", seq, call->ackr_win_top); - __clear_bit(ackbit, call->ackr_window); + hard_ack = READ_ONCE(call->rx_hard_ack); + if (after(seq, hard_ack + call->rx_winsize)) { ack = RXRPC_ACK_EXCEEDS_WINDOW; - goto discard_and_ack; + ack_serial = serial; + goto ack; } - if (seq == call->rx_data_expect) { - clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); - call->rx_data_expect++; - } else if (seq > call->rx_data_expect) { - _debug("oos #%u [%u]", seq, call->rx_data_expect); - call->rx_data_expect = seq + 1; - if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - goto enqueue_and_ack; + flags = sp->hdr.flags; + if (flags & RXRPC_JUMBO_PACKET) { + if (call->nr_jumbo_dup > 3) { + ack = RXRPC_ACK_NOSPACE; + ack_serial = serial; + goto ack; } - goto enqueue_packet; + annotation = 1; } - if (seq != call->rx_data_post) { - _debug("ahead #%u [%u]", seq, call->rx_data_post); - goto enqueue_packet; +next_subpacket: + queued = false; + ix = seq & RXRPC_RXTX_BUFF_MASK; + len = skb->data_len; + if (flags & RXRPC_JUMBO_PACKET) + len = RXRPC_JUMBO_DATALEN; + + if (flags & RXRPC_LAST_PACKET) { + if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) + return rxrpc_proto_abort("LSN", call, seq); + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) + return rxrpc_proto_abort("LSA", call, seq); } - if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) - goto protocol_error; - - /* if the packet need security things doing to it, then it goes down - * the slow path */ - if (call->security_ix) - goto enqueue_packet; - - sp->call = call; - rxrpc_get_call_for_skb(call, skb); - terminal = ((flags & RXRPC_LAST_PACKET) && - !(flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOBUFS) { - 
__clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_NOSPACE; - goto discard_and_ack; + if (before_eq(seq, hard_ack)) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; + goto skip; + } + + if (flags & RXRPC_REQUEST_ACK && !ack) { + ack = RXRPC_ACK_REQUESTED; + ack_serial = serial; + } + + if (call->rxtx_buffer[ix]) { + rxrpc_input_dup_data(call, seq, annotation, &jumbo_dup); + if (ack != RXRPC_ACK_DUPLICATE) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; } - goto out; + immediate_ack = true; + goto skip; } - skb = NULL; - sp = NULL; - - _debug("post #%u", seq); - ASSERTCMP(call->rx_data_post, ==, seq); - call->rx_data_post++; - - if (flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); - - /* if we've reached an out of sequence packet then we need to drain - * that queue into the socket Rx queue now */ - if (call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). + */ + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) + smp_store_release(&call->rx_top, seq); + queued = true; + + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + ack_serial = serial; + } + call->rx_expect_next = seq + 1; } - spin_unlock(&call->lock); - atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, false); - _leave(" = 0 [posted]"); - return 0; +skip: + offset += len; + if (flags & RXRPC_JUMBO_PACKET) { + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + return rxrpc_proto_abort("XJF", call, seq); + offset += sizeof(struct rxrpc_jumbo_header); + seq++; + serial++; + annotation++; + if (flags & RXRPC_JUMBO_PACKET) + annotation |= RXRPC_RX_ANNO_JLAST; + + _proto("Rx DATA Jumbo %%%u", serial); + goto next_subpacket; + } -protocol_error: - ret = -EBADMSG; -out: - spin_unlock(&call->lock); - _leave(" = %d", ret); - return ret; + if (queued && flags & RXRPC_LAST_PACKET && !ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } -discard_and_ack: - _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, skew, serial, true); -discard: - spin_unlock(&call->lock); - rxrpc_free_skb(skb); - _leave(" = 0 [discarded]"); - return 0; +ack: + if (ack) + rxrpc_propose_ACK(call, ack, skew, ack_serial, + immediate_ack, true); -enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, skew, serial, true); -enqueue_packet: - _net("defer skb %p", skb); - spin_unlock(&call->lock); - skb_queue_tail(&call->rx_queue, skb); - atomic_inc(&call->ackr_not_idle); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - _leave(" = 0 [queued]"); - return 0; + if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) + rxrpc_notify_socket(call); + _leave(" [queued]"); } /* - * assume an implicit ACKALL of the transmission phase of a client 
socket upon - * reception of the first reply packet + * Process the extra information that may be appended to an ACK packet */ -static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_ackinfo *ackinfo) { - write_lock_bh(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - call->acks_latest = serial; - - _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); - write_unlock_bh(&call->state_lock); + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; + unsigned int mtu; + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + sp->hdr.serial, + ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), + ntohl(ackinfo->rwind), ntohl(ackinfo->jumbo_max)); + + if (call->tx_winsize > ntohl(ackinfo->rwind)) + call->tx_winsize = ntohl(ackinfo->rwind); + + mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + + peer = call->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_RESEND, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); +/* + * Process individual soft ACKs. + * + * Each ACK in the array corresponds to one packet and can be either an ACK or + * a NAK. If we get find an explicitly NAK'd packet we resend immediately; + * packets that lie beyond the end of the ACK list are scheduled for resend by + * the timer on the basis that the peer might just not have processed them at + * the time the ACK was sent. + */ +static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, + rxrpc_seq_t seq, int nr_acks) +{ + bool resend = false; + int ix; + + for (; nr_acks > 0; nr_acks--, seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + switch (*acks) { + case RXRPC_ACK_TYPE_ACK: + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; + break; + case RXRPC_ACK_TYPE_NACK: + if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK) + continue; + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK; + resend = true; + break; + default: + return rxrpc_proto_abort("SFT", call, 0); } - break; - - default: - write_unlock_bh(&call->state_lock); - break; } + + if (resend && + !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); } /* - * post an incoming packet to the nominated call to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an ACK packet. + * + * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet + * in the ACK array. Anything before that is hard-ACK'd and may be discarded. + * + * A hard-ACK means that a packet has been processed and may be discarded; a + * soft-ACK means that the packet may be discarded and retransmission + * requested. A phase is complete when all packets are hard-ACK'd. 
*/ -void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code; - - _enter("%p,%p", call, skb); - - ASSERT(!irqs_disabled()); - -#if 0 // INJECT RX ERROR - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - static int skip = 0; - if (++skip == 3) { - printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); - skip = 0; - goto free_packet; - } + union { + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo info; + u8 acks[RXRPC_MAXACKS]; + } buf; + rxrpc_seq_t first_soft_ack, hard_ack; + int nr_acks, offset; + + _enter(""); + + if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) { + _debug("extraction failure"); + return rxrpc_proto_abort("XAK", call, 0); } -#endif - - /* request ACK generation for any ACK or DATA packet that requests - * it */ - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", sp->hdr.serial); + sp->offset += sizeof(buf.ack); + + first_soft_ack = ntohl(buf.ack.firstPacket); + hard_ack = first_soft_ack - 1; + nr_acks = buf.ack.nAcks; + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + sp->hdr.serial, + ntohs(buf.ack.maxSkew), + first_soft_ack, + ntohl(buf.ack.previousPacket), + ntohl(buf.ack.serial), + rxrpc_acks(buf.ack.reason), + buf.ack.nAcks); + + if (buf.ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + skew, sp->hdr.serial, true, true); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skb->priority, sp->hdr.serial, false); + skew, sp->hdr.serial, true, true); } - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - _debug("abort"); - - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - goto protocol_error; - - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - - if (__rxrpc_set_call_completion(call, - RXRPC_CALL_REMOTELY_ABORTED, - abort_code, ECONNABORTED)) { - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - rxrpc_queue_call(call); - } - goto free_packet; - - case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); - - if (rxrpc_is_service_call(call)) - goto protocol_error; + offset = sp->offset + nr_acks + 3; + if (skb->data_len >= offset + sizeof(buf.info)) { + if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); + rxrpc_input_ackinfo(call, skb, &buf.info); + } - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - __rxrpc_set_call_completion(call, - RXRPC_CALL_SERVER_BUSY, - 0, EBUSY); - set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - rxrpc_queue_call(call); - case RXRPC_CALL_SERVER_BUSY: - goto free_packet_unlock; - default: - goto protocol_error_locked; - } + if (first_soft_ack == 0) + return rxrpc_proto_abort("AK0", call, 0); + /* Ignore ACKs unless we are or have just been transmitting. 
*/ + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_SEND_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); - goto protocol_error; - - case RXRPC_PACKET_TYPE_DATA: - _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - if (sp->hdr.seq == 0) - goto protocol_error; - - call->ackr_prev_seq = sp->hdr.seq; + return; + } - /* received data implicitly ACKs all of the request packets we - * sent when we're acting as a client */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, sp->hdr.serial); + /* Discard any out-of-order or duplicate ACKs. */ + if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + sp->hdr.serial, call->acks_latest); + return; + } + call->acks_latest = sp->hdr.serial; - switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { - case 0: - skb = NULL; - goto done; + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && + hard_ack == call->tx_top) { + rxrpc_end_tx_phase(call, "ETA"); + return; + } - default: - BUG(); + if (before(hard_ack, call->tx_hard_ack) || + after(hard_ack, call->tx_top)) + return rxrpc_proto_abort("AKW", call, 0); - /* data packet received beyond the last packet */ - case -EBADMSG: - goto protocol_error; - } + if (after(hard_ack, call->tx_hard_ack)) + rxrpc_rotate_tx_window(call, hard_ack); - case RXRPC_PACKET_TYPE_ACKALL: - case RXRPC_PACKET_TYPE_ACK: - /* ACK processing is done in process context */ - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - skb_queue_tail(&call->rx_queue, skb); - rxrpc_queue_call(call); - skb = NULL; - } - read_unlock_bh(&call->state_lock); - goto free_packet; - } + if (after(first_soft_ack, call->tx_top)) + return; -protocol_error: - _debug("protocol error"); - write_lock_bh(&call->state_lock); -protocol_error_locked: - if (__rxrpc_abort_call("FPR", call, 0, RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); -free_packet_unlock: - write_unlock_bh(&call->state_lock); -free_packet: - rxrpc_free_skb(skb); -done: - _leave(""); + if (nr_acks > call->tx_top - first_soft_ack + 1) + nr_acks = first_soft_ack - call->tx_top + 1; + if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); } /* - * split up a jumbo data packet + * Process an ACKALL packet. 
*/ -static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, - struct sk_buff *jumbo) +static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_jumbo_header jhdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *part; - - _enter(",{%u,%u}", jumbo->data_len, jumbo->len); - - sp = rxrpc_skb(jumbo); - - do { - sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; - - /* make a clone to represent the first subpacket in what's left - * of the jumbo packet */ - part = skb_clone(jumbo, GFP_ATOMIC); - if (!part) { - /* simply ditch the tail in the event of ENOMEM */ - pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); - break; - } - rxrpc_new_skb(part); - - pskb_trim(part, RXRPC_JUMBO_DATALEN); - - if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) - goto protocol_error; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) - goto protocol_error; - if (!pskb_pull(jumbo, sizeof(jhdr))) - BUG(); + _proto("Rx ACKALL %%%u", sp->hdr.serial); - sp->hdr.seq += 1; - sp->hdr.serial += 1; - sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = ntohs(jhdr._rsvd); + rxrpc_end_tx_phase(call, "ETL"); +} - _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); +/* + * Process an ABORT packet. + */ +static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code = RX_CALL_DEAD; - rxrpc_fast_process_packet(call, part); - part = NULL; + _enter(""); - } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + if (skb->len >= 4 && + skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0) + abort_code = ntohl(wtmp); - rxrpc_fast_process_packet(call, jumbo); - _leave(""); - return; + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); -protocol_error: - _debug("protocol error"); - rxrpc_free_skb(part); - if (rxrpc_abort_call("PJP", call, sp->hdr.seq, - RX_PROTOCOL_ERROR, EPROTO)) - rxrpc_queue_call(call); - rxrpc_free_skb(jumbo); - _leave(""); + if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED)) + rxrpc_notify_socket(call); } /* - * post an incoming packet to the appropriate call/socket to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an incoming call packet. 
*/ -static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct sk_buff *skb) +static void rxrpc_input_call_packet(struct rxrpc_call *call, + struct sk_buff *skb, u16 skew) { - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _enter("%p,%p", call, skb); - sp = rxrpc_skb(skb); - - _debug("extant call [%d]", call->state); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_COMPLETE: - switch (call->completion) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, - &call->events)) { - rxrpc_queue_call(call); - goto free_unlock; - } - default: - goto dead_call; - case RXRPC_CALL_SUCCEEDED: - if (rxrpc_is_service_call(call)) - goto dead_call; - goto resend_final_ack; - } - - case RXRPC_CALL_CLIENT_FINAL_ACK: - goto resend_final_ack; + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + rxrpc_input_data(call, skb, skew); + break; - default: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_input_ack(call, skb, skew); break; - } - read_unlock(&call->state_lock); + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_process_jumbo_packet(call, skb); - else - rxrpc_fast_process_packet(call, skb); + /* Just ignore BUSY packets from the server; the retry and + * lifespan timers will take care of business. BUSY packets + * from the client don't make sense. + */ + break; - goto done; + case RXRPC_PACKET_TYPE_ABORT: + rxrpc_input_abort(call, skb); + break; -resend_final_ack: - _debug("final ack again"); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - goto free_unlock; + case RXRPC_PACKET_TYPE_ACKALL: + rxrpc_input_ackall(call, skb); + break; -dead_call: - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(conn->params.local, skb); - goto unlock; + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); + break; } -free_unlock: - rxrpc_free_skb(skb); -unlock: - read_unlock(&call->state_lock); -done: + _leave(""); } @@ -600,6 +598,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, rxrpc_queue_local(local); } +/* + * put a packet up for transport-level abort + */ +static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); +} + /* * Extract the wire header from a packet and translate the byte order. */ @@ -611,8 +620,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return -EBADMSG; - if (!pskb_pull(skb, sizeof(whdr))) - BUG(); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -626,6 +633,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); + sp->offset = sizeof(whdr); return 0; } @@ -637,19 +645,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * shut down and the local endpoint from going away, thus sk_user_data will not * be cleared until this function returns. 
*/ -void rxrpc_data_ready(struct sock *sk) +void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; + struct rxrpc_channel *chan; + struct rxrpc_call *call; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local = udp_sk->sk_user_data; struct sk_buff *skb; + unsigned int channel; int ret, skew; - _enter("%p", sk); + _enter("%p", udp_sk); ASSERT(!irqs_disabled()); - skb = skb_recv_datagram(sk, 0, 1, &ret); + skb = skb_recv_datagram(udp_sk, 0, 1, &ret); if (!skb) { if (ret == -EAGAIN) return; @@ -695,111 +706,122 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } - if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: rxrpc_post_packet_to_local(local, skb); goto out; - } - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) - goto bad_message; + case RXRPC_PACKET_TYPE_BUSY: + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + goto discard; + + case RXRPC_PACKET_TYPE_DATA: + if (sp->hdr.callNumber == 0) + goto bad_message; + if (sp->hdr.flags & RXRPC_JUMBO_PACKET && + !rxrpc_validate_jumbo(skb)) + goto bad_message; + break; + } rcu_read_lock(); conn = rxrpc_find_connection_rcu(local, skb); - if (!conn) { - skb->priority = 0; - goto cant_route_call; - } + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; - /* Note the serial number skew here */ - skew = (int)sp->hdr.serial - (int)conn->hi_serial; - if (skew >= 0) { - if (skew > 0) - conn->hi_serial = sp->hdr.serial; - skb->priority = 0; - } else { - skew = -skew; - skb->priority = min(skew, 65535); - } + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } + + /* Note the serial number skew here */ + skew = (int)sp->hdr.serial - (int)conn->hi_serial; + if (skew >= 0) { + if (skew > 0) + conn->hi_serial = sp->hdr.serial; + } else { + skew = -skew; + skew = min(skew, 65535); + } - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - goto out_unlock; - } else { /* Call-bound packets are routed by connection channel. */ - unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; - struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call; + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) goto discard_unlock; if (sp->hdr.callNumber == chan->last_call) { - /* For the previous service call, if completed - * successfully, we discard all further packets. + /* For the previous service call, if completed successfully, we + * discard all further packets. */ if (rxrpc_conn_is_service(conn) && (chan->last_type == RXRPC_PACKET_TYPE_ACK || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) goto discard_unlock; - /* But otherwise we need to retransmit the final packet - * from data cached in the connection record. + /* But otherwise we need to retransmit the final packet from + * data cached in the connection record. 
*/ rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; } call = rcu_dereference(chan->call); - if (!call || atomic_read(&call->usage) == 0) - goto cant_route_call; + } else { + skew = 0; + call = NULL; + } - rxrpc_see_call(call); - rxrpc_post_packet_to_call(conn, call, skb); - goto out_unlock; + if (!call || atomic_read(&call->usage) == 0) { + if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + sp->hdr.callNumber == 0 || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + goto bad_message_unlock; + if (sp->hdr.seq != 1) + goto discard_unlock; + call = rxrpc_new_incoming_call(local, conn, skb); + if (!call) { + rcu_read_unlock(); + goto reject_packet; + } } + rxrpc_input_call_packet(call, skb, skew); + goto discard_unlock; + discard_unlock: - rxrpc_free_skb(skb); -out_unlock: rcu_read_unlock(); +discard: + rxrpc_free_skb(skb); out: trace_rxrpc_rx_done(0, 0); return; -cant_route_call: +out_unlock: rcu_read_unlock(); + goto out; - _debug("can't route call"); - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == 1) { - _debug("first packet"); - skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->processor); - _leave(" [incoming]"); - goto out; - } - skb->priority = RX_INVALID_OPERATION; - } else { - skb->priority = RX_CALL_DEAD; - } - - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - _debug("reject type %d",sp->hdr.type); - goto reject_packet; - } else { - rxrpc_free_skb(skb); - } - _leave(" [no call]"); - return; +wrong_security: + rcu_read_unlock(); + trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + skb->priority = RXKADINCONSISTENCY; + goto post_abort; +bad_message_unlock: + rcu_read_unlock(); bad_message: + trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); skb->priority = RX_PROTOCOL_ERROR; +post_abort: + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index a4aba0246731..7d4375e557e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -30,14 +30,18 @@ static int none_secure_packet(struct rxrpc_call *call, return 0; } -static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - rxrpc_seq_t seq, - u16 expected_cksum) +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq, u16 expected_cksum) { return 0; } +static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ +} + static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) @@ -79,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = { .prime_packet_security = none_prime_packet_security, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, + .locate_data = none_locate_data, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index bcc6593b4cdb..cdd58e6e9fbd 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -98,7 +98,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) + if (skb_copy_bits(skb, sp->offset, &v, 1) < 0) return; 
_proto("Rx VERSION { %02x }", v); if (v == 0) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 610916f4ae34..782b9adf67cb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -77,7 +77,6 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) INIT_WORK(&local->processor, rxrpc_local_processor); INIT_HLIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); - skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); local->client_conns = RB_ROOT; @@ -308,7 +307,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ - rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); @@ -332,11 +330,6 @@ static void rxrpc_local_processor(struct work_struct *work) if (atomic_read(&local->usage) == 0) return rxrpc_local_destroyer(local); - if (!skb_queue_empty(&local->accept_queue)) { - rxrpc_accept_incoming_calls(local); - again = true; - } - if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); again = true; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 39e7cc37c392..fd096f742e4b 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -50,7 +50,7 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned int rxrpc_rx_window_size = 32; +unsigned int rxrpc_rx_window_size = RXRPC_RXTX_BUFF_SIZE - 46; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8756d74fd74b..719a4c23f09d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include "ar-internal.h" @@ -38,20 +40,38 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_seq_t hard_ack, top, seq; + int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; + /* Barrier against rxrpc_input_data(). */ + hard_ack = READ_ONCE(call->rx_hard_ack); + top = smp_load_acquire(&call->rx_top); + pkt->ack.bufferSpace = htons(8); - pkt->ack.maxSkew = htons(0); - pkt->ack.firstPacket = htonl(call->rx_data_eaten + 1); + pkt->ack.maxSkew = htons(call->ackr_skew); + pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(call->ackr_serial); - pkt->ack.reason = RXRPC_ACK_IDLE; - pkt->ack.nAcks = 0; + pkt->ack.reason = call->ackr_reason; + pkt->ack.nAcks = top - hard_ack; + + if (after(top, hard_ack)) { + seq = hard_ack + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + if (call->rxtx_buffer[ix]) + *ackp++ = RXRPC_ACK_TYPE_ACK; + else + *ackp++ = RXRPC_ACK_TYPE_NACK; + seq++; + } while (before_eq(seq, top)); + } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; + jmax = (call->nr_jumbo_dup > 3) ? 
1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); pkt->ackinfo.rwind = htonl(rxrpc_rx_window_size); @@ -60,11 +80,11 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; - return 3; + return top - hard_ack + 3; } /* - * Send a final ACK or ABORT call packet. + * Send an ACK or ABORT call packet. */ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) { @@ -158,6 +178,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { + switch (pkt->whdr.type) { + case RXRPC_PACKET_TYPE_ACK: + rxrpc_propose_ACK(call, pkt->ack.reason, + ntohs(pkt->ack.maxSkew), + ntohl(pkt->ack.serial), + true, true); + break; + case RXRPC_PACKET_TYPE_ABORT: + break; + } + } + out: rxrpc_put_connection(conn); kfree(pkt); @@ -233,3 +266,77 @@ send_fragmentable: _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); return ret; } + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct rxrpc_local *local) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + _enter("%d", local->debug_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + } + + _leave(""); +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 27b9ecad007e..c8948936c6fc 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -129,15 +129,14 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + rxrpc_new_skb(skb); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - kfree_skb(skb); + rxrpc_free_skb(skb); return; } - rxrpc_new_skb(skb); - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb); if (peer && !rxrpc_get_peer_maybe(peer)) @@ -249,7 +248,6 @@ void rxrpc_peer_error_distributor(struct work_struct *work) container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; enum rxrpc_call_completion compl; - bool queue; int error; _enter(""); @@ -272,15 +270,8 @@ void rxrpc_peer_error_distributor(struct 
work_struct *work) hlist_del_init(&call->error_link); rxrpc_see_call(call); - queue = false; - write_lock(&call->state_lock); - if (__rxrpc_set_call_completion(call, compl, 0, error)) { - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - queue = true; - } - write_unlock(&call->state_lock); - if (queue) - rxrpc_queue_call(call); + if (rxrpc_set_call_completion(call, compl, 0, error)) + rxrpc_notify_socket(call); } spin_unlock_bh(&peer->lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index aebc73ac16dc..2efe29a4c232 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -198,6 +198,32 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) return peer; } +/* + * Initialise peer record. + */ +static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) +{ + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (peer->srx.transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; +} + /* * Set up a new peer. */ @@ -214,29 +240,39 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, if (peer) { peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); + rxrpc_init_peer(peer, hash_key); + } - rxrpc_assess_MTU_size(peer); - peer->mtu = peer->if_mtu; - - if (srx->transport.family == AF_INET) { - peer->hdrsize = sizeof(struct iphdr); - switch (srx->transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } + _leave(" = %p", peer); + return peer; +} - peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; +/* + * Set up a new incoming peer. The address is prestored in the preallocated + * peer. 
+ */ +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, + struct rxrpc_peer *prealloc) +{ + struct rxrpc_peer *peer; + unsigned long hash_key; + + hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); + prealloc->local = local; + rxrpc_init_peer(prealloc, hash_key); + + spin_lock(&rxrpc_peer_hash_lock); + + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = prealloc; + hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key); } - _leave(" = %p", peer); + spin_unlock(&rxrpc_peer_hash_lock); return peer; } @@ -272,7 +308,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -282,7 +318,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, hash_add_rcu(rxrpc_peer_hash, &candidate->hash_link, hash_key); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); if (peer) kfree(candidate); @@ -307,9 +343,9 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) { ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); hash_del_rcu(&peer->hash_link); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); kfree_rcu(peer, rcu); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6876ffb3b410..20d0b5c6f81b 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -19,319 +19,479 @@ #include "ar-internal.h" /* - * receive a message from an RxRPC socket - * - we need to be careful about two or more threads calling recvmsg - * simultaneously + * Post a call for attention by the socket or kernel service. Further + * notifications are suppressed by putting recvmsg_link on a dummy queue. */ -int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +void rxrpc_notify_socket(struct rxrpc_call *call) { - struct rxrpc_skb_priv *sp; - struct rxrpc_call *call = NULL, *continue_call = NULL; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct sk_buff *skb; - long timeo; - int copy, ret, ullen, offset, copied = 0; - u32 abort_code; + struct rxrpc_sock *rx; + struct sock *sk; - DEFINE_WAIT(wait); + _enter("%d", call->debug_id); - _enter(",,,%zu,%d", len, flags); + if (!list_empty(&call->recvmsg_link)) + return; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + sk = &rx->sk; + if (rx && sk->sk_state < RXRPC_CLOSE) { + if (call->notify_rx) { + call->notify_rx(sk, call, call->user_call_ID); + } else { + write_lock_bh(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + rxrpc_get_call(call, rxrpc_call_got); + list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); + } + write_unlock_bh(&rx->recvmsg_lock); - if (flags & (MSG_OOB | MSG_TRUNC)) - return -EOPNOTSUPP; + if (!sock_flag(sk, SOCK_DEAD)) { + _debug("call %ps", sk->sk_data_ready); + sk->sk_data_ready(sk); + } + } + } - ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + rcu_read_unlock(); + _leave(""); +} - timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); - msg->msg_flags |= MSG_MORE; +/* + * Pass a call terminating message to userspace. 
+ */ +static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) +{ + u32 tmp = 0; + int ret; - lock_sock(&rx->sk); + switch (call->completion) { + case RXRPC_CALL_SUCCEEDED: + ret = 0; + if (rxrpc_is_service_call(call)) + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); + break; + case RXRPC_CALL_REMOTELY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_LOCALLY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_NETWORK_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); + break; + case RXRPC_CALL_LOCAL_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); + break; + default: + pr_err("Invalid terminal call state %u\n", call->state); + BUG(); + break; + } - for (;;) { - /* return immediately if a client socket has no outstanding - * calls */ - if (RB_EMPTY_ROOT(&rx->calls)) { - if (copied) - goto out; - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call, - rxrpc_call_put); - return -ENODATA; - } - } + return ret; +} - /* get the next message on the Rx queue */ - skb = skb_peek(&rx->sk.sk_receive_queue); - if (!skb) { - /* nothing remains on the queue */ - if (copied && - (flags & MSG_PEEK || timeo == 0)) - goto out; +/* + * Pass back notification of a new call. The call is added to the + * to-be-accepted list. This means that the next call to be accepted might not + * be the last call seen awaiting acceptance, but unless we leave this on the + * front of the queue and block all other messages until someone gives us a + * user_ID for it, there's not a lot we can do. 
+ */ +static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, int flags) +{ + int tmp = 0, ret; - /* wait for a message to turn up */ - release_sock(&rx->sk); - prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, - TASK_INTERRUPTIBLE); - ret = sock_error(&rx->sk); - if (ret) - goto wait_error; - - if (skb_queue_empty(&rx->sk.sk_receive_queue)) { - if (signal_pending(current)) - goto wait_interrupted; - timeo = schedule_timeout(timeo); - } - finish_wait(sk_sleep(&rx->sk), &wait); - lock_sock(&rx->sk); - continue; - } + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); - peek_next_packet: - rxrpc_see_skb(skb); - sp = rxrpc_skb(skb); - call = sp->call; - ASSERT(call != NULL); - rxrpc_see_call(call); - - _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); - - /* make sure we wait for the state to be updated in this call */ - spin_lock_bh(&call->lock); - spin_unlock_bh(&call->lock); - - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("packet from released call"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - continue; - } + if (ret == 0 && !(flags & MSG_PEEK)) { + _debug("to be accepted"); + write_lock_bh(&rx->recvmsg_lock); + list_del_init(&call->recvmsg_link); + write_unlock_bh(&rx->recvmsg_lock); - /* determine whether to continue last data receive */ - if (continue_call) { - _debug("maybe cont"); - if (call != continue_call || - skb->mark != RXRPC_SKB_MARK_DATA) { - release_sock(&rx->sk); - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [noncont]", copied); - return copied; - } - } + write_lock(&rx->call_lock); + list_add_tail(&call->accept_link, &rx->to_be_accepted); + write_unlock(&rx->call_lock); + } - rxrpc_get_call(call, rxrpc_call_got); + return ret; +} - /* copy the peer address and timestamp */ - if (!continue_call) { - if (msg->msg_name) { - size_t len = - sizeof(call->conn->params.peer->srx); - memcpy(msg->msg_name, - &call->conn->params.peer->srx, len); - msg->msg_namelen = len; - } - sock_recv_timestamp(msg, &rx->sk, skb); - } +/* + * End the packet reception phase. 
+ */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call) +{ + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - /* receive the message */ - if (skb->mark != RXRPC_SKB_MARK_DATA) - goto receive_non_data_message; + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } else { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false); + } - _debug("recvmsg DATA #%u { %d, %d }", - sp->hdr.seq, skb->len, sp->offset); + write_lock_bh(&call->state_lock); - if (!continue_call) { - /* only set the control data once per recvmsg() */ - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - } + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + __rxrpc_call_completed(call); + break; - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + break; + default: + break; + } - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + write_unlock_bh(&call->state_lock); +} - offset = sp->offset; - copy = skb->len - offset; - if (copy > len - copied) - copy = len - copied; +/* + * Discard a packet we've used up and advance the Rx window by one. + */ +static void rxrpc_rotate_rx_window(struct rxrpc_call *call) +{ + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top; + int ix; + + _enter("%d", call->debug_id); + + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + ASSERT(before(hard_ack, top)); + + hard_ack++; + ix = hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + /* Barrier against rxrpc_input_data(). */ + smp_store_release(&call->rx_hard_ack, hard_ack); - ret = skb_copy_datagram_msg(skb, offset, msg, copy); + rxrpc_free_skb(skb); + _debug("%u,%u,%lx", hard_ack, top, call->flags); + if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + rxrpc_end_rx_phase(call); +} + +/* + * Decrypt and verify a (sub)packet. The packet's length may be changed due to + * padding, but if this is the case, the packet length will be resident in the + * socket buffer. Note that we can't modify the master skb info as the skb may + * be the home to multiple subpackets. + */ +static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + u8 annotation, + unsigned int offset, unsigned int len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t seq = sp->hdr.seq; + u16 cksum = sp->hdr.cksum; + + _enter(""); + + /* For all but the head jumbo subpacket, the security checksum is in a + * jumbo header immediately prior to the data. + */ + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { + __be16 tmp; + if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) + BUG(); + cksum = ntohs(tmp); + seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; + } + + return call->conn->security->verify_packet(call, skb, offset, len, + seq, cksum); +} + +/* + * Locate the data within a packet. This is complicated by: + * + * (1) An skb may contain a jumbo packet - so we have to find the appropriate + * subpacket. 
+ * + * (2) The (sub)packets may be encrypted and, if so, the encrypted portion + * contains an extra header which includes the true length of the data, + * excluding any encrypted padding. + */ +static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + u8 *_annotation, + unsigned int *_offset, unsigned int *_len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = *_offset; + unsigned int len = *_len; + int ret; + u8 annotation = *_annotation; + + if (offset > 0) + return 0; + + /* Locate the subpacket */ + offset = sp->offset; + len = skb->len - sp->offset; + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { + offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * + RXRPC_JUMBO_SUBPKTLEN); + len = (annotation & RXRPC_RX_ANNO_JLAST) ? + skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; + } + + if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { + ret = rxrpc_verify_packet(call, skb, annotation, offset, len); if (ret < 0) - goto copy_error; + return ret; + *_annotation |= RXRPC_RX_ANNO_VERIFIED; + } - /* handle piecemeal consumption of data packets */ - _debug("copied %d+%d", copy, copied); + *_offset = offset; + *_len = len; + call->conn->security->locate_data(call, skb, _offset, _len); + return 0; +} - offset += copy; - copied += copy; +/* + * Deliver messages to a call. This keeps processing packets until the buffer + * is filled and we find either more DATA (returns 0) or the end of the DATA + * (returns 1). If more packets are required, it returns -EAGAIN. + */ +static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, struct iov_iter *iter, + size_t len, int flags, size_t *_offset) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + size_t remain; + bool last; + unsigned int rx_pkt_offset, rx_pkt_len; + int ix, copy, ret = 0; + + _enter(""); + + rx_pkt_offset = call->rx_pkt_offset; + rx_pkt_len = call->rx_pkt_len; + + /* Barriers against rxrpc_input_data(). */ + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + if (!skb) + break; + smp_rmb(); + rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); - if (!(flags & MSG_PEEK)) - sp->offset = offset; + if (msg) + sock_recv_timestamp(msg, sock->sk, skb); + + ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + _debug("recvmsg %x DATA #%u { %d, %d }", + sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); + + /* We have to handle short, empty and used-up DATA packets. */ + remain = len - *_offset; + copy = rx_pkt_len; + if (copy > remain) + copy = remain; + if (copy > 0) { + ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret < 0) + goto out; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d @%zu", copy, *_offset); + + rx_pkt_offset += copy; + rx_pkt_len -= copy; + *_offset += copy; + } - if (sp->offset < skb->len) { + if (rx_pkt_len > 0) { _debug("buffer full"); - ASSERTCMP(copied, ==, len); + ASSERTCMP(*_offset, ==, len); break; } - /* we transferred the whole data packet */ + /* The whole packet has been transferred. 
*/ + last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) - rxrpc_kernel_data_consumed(call, skb); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - _debug("last"); - if (rxrpc_conn_is_client(call->conn)) { - /* last byte of reply received */ - ret = copied; - goto terminal_message; - } + rxrpc_rotate_rx_window(call); + rx_pkt_offset = 0; + rx_pkt_len = 0; - /* last bit of request received */ - if (!(flags & MSG_PEEK)) { - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != - skb) - BUG(); - rxrpc_free_skb(skb); - } - msg->msg_flags &= ~MSG_MORE; - break; - } + ASSERTIFCMP(last, seq, ==, top); + } - /* move on to the next data message */ - _debug("next"); - if (!continue_call) - continue_call = sp->call; - else - rxrpc_put_call(call, rxrpc_call_put); - call = NULL; - - if (flags & MSG_PEEK) { - _debug("peek next"); - skb = skb->next; - if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) - break; - goto peek_next_packet; - } + if (after(seq, top)) { + ret = -EAGAIN; + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + ret = 1; + } +out: + if (!(flags & MSG_PEEK)) { + call->rx_pkt_offset = rx_pkt_offset; + call->rx_pkt_len = rx_pkt_len; + } + _leave(" = %d [%u/%u]", ret, seq, top); + return ret; +} - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); +/* + * Receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct list_head *l; + size_t copied = 0; + long timeo; + int ret; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + +try_again: + lock_sock(&rx->sk); + + /* Return immediately if a client socket has no outstanding calls */ + if (RB_EMPTY_ROOT(&rx->calls) && + list_empty(&rx->recvmsg_q) && + rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + return -ENODATA; } - /* end of non-terminal data packet reception for the moment */ - _debug("end rcv data"); -out: - release_sock(&rx->sk); - if (call) - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d [data]", copied); - return copied; - - /* handle non-DATA messages such as aborts, incoming connections and - * final ACKs */ -receive_non_data_message: - _debug("non-data"); - - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { - _debug("RECV NEW CALL"); - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); - if (ret < 0) - goto copy_error; - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); + if (list_empty(&rx->recvmsg_q)) { + ret = -EWOULDBLOCK; + if (timeo == 0) + goto error_no_call; + + release_sock(&rx->sk); + + /* Wait for something to happen */ + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (list_empty(&rx->recvmsg_q)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); } - goto out; + finish_wait(sk_sleep(&rx->sk), &wait); + goto try_again; } - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - 
ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + /* Find the next call and dequeue it if we're not just peeking. If we + * do dequeue it, that comes with a ref that we will need to release. + */ + write_lock_bh(&rx->recvmsg_lock); + l = rx->recvmsg_q.next; + call = list_entry(l, struct rxrpc_call, recvmsg_link); + if (!(flags & MSG_PEEK)) + list_del_init(&call->recvmsg_link); + else + rxrpc_get_call(call, rxrpc_call_got); + write_unlock_bh(&rx->recvmsg_lock); - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: + _debug("recvmsg call %p", call); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - case RXRPC_SKB_MARK_FINAL_ACK: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); - break; - case RXRPC_SKB_MARK_BUSY: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = call->abort_code; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - if (call->error) { - abort_code = call->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), + &call->user_call_ID); } + if (ret < 0) + goto error; + } + + if (msg->msg_name) { + size_t len = sizeof(call->conn->params.peer->srx); + memcpy(msg->msg_name, &call->conn->params.peer->srx, len); + msg->msg_namelen = len; + } + + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; - case RXRPC_SKB_MARK_NET_ERROR: - _debug("RECV NET ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - _debug("RECV LOCAL ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) + ret = 0; break; default: - pr_err("Unknown packet mark %u\n", skb->mark); - BUG(); + ret = 0; break; } if (ret < 0) - goto copy_error; - -terminal_message: - _debug("terminal"); - msg->msg_flags &= ~MSG_MORE; - msg->msg_flags |= MSG_EOR; + goto error; - if (!(flags & MSG_PEEK)) { - _net("free terminal skb %p", skb); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - rxrpc_release_call(rx, call); + if (call->state == RXRPC_CALL_COMPLETE) { + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; } - release_sock(&rx->sk); - rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - _leave(" = %d", ret); - return ret; + if (ret == 0) + msg->msg_flags |= MSG_MORE; + else + msg->msg_flags &= ~MSG_MORE; + ret = copied; -copy_error: - _debug("copy error"); - release_sock(&rx->sk); +error: rxrpc_put_call(call, rxrpc_call_put); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); 
+error_no_call: + release_sock(&rx->sk); _leave(" = %d", ret); return ret; @@ -339,85 +499,8 @@ wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - if (continue_call) - rxrpc_put_call(continue_call, rxrpc_call_put); - if (copied) - copied = ret; - _leave(" = %d [waitfail %d]", copied, ret); - return copied; - -} - -/* - * Deliver messages to a call. This keeps processing packets until the buffer - * is filled and we find either more DATA (returns 0) or the end of the DATA - * (returns 1). If more packets are required, it returns -EAGAIN. - * - * TODO: Note that this is hacked in at the moment and will be replaced. - */ -static int temp_deliver_data(struct socket *sock, struct rxrpc_call *call, - struct iov_iter *iter, size_t size, - size_t *_offset) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - size_t remain; - int ret, copy; - - _enter("%d", call->debug_id); - -next: - local_bh_disable(); - skb = skb_dequeue(&call->knlrecv_queue); - local_bh_enable(); - if (!skb) { - if (test_bit(RXRPC_CALL_RX_NO_MORE, &call->flags)) - return 1; - _leave(" = -EAGAIN [empty]"); - return -EAGAIN; - } - - sp = rxrpc_skb(skb); - _debug("dequeued %p %u/%zu", skb, sp->offset, size); - - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - remain = size - *_offset; - if (remain > 0) { - copy = skb->len - sp->offset; - if (copy > remain) - copy = remain; - ret = skb_copy_datagram_iter(skb, sp->offset, iter, - copy); - if (ret < 0) - goto requeue_and_leave; - - /* handle piecemeal consumption of data packets */ - sp->offset += copy; - *_offset += copy; - } - - if (sp->offset < skb->len) - goto partially_used_skb; - - /* We consumed the whole packet */ - ASSERTCMP(sp->offset, ==, skb->len); - if (sp->hdr.flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RX_NO_MORE, &call->flags); - rxrpc_kernel_data_consumed(call, skb); - rxrpc_free_skb(skb); - goto next; - - default: - rxrpc_free_skb(skb); - goto next; - } - -partially_used_skb: - ASSERTCMP(*_offset, ==, size); - ret = 0; -requeue_and_leave: - skb_queue_head(&call->knlrecv_queue, skb); + release_sock(&rx->sk); + _leave(" = %d [wait]", ret); return ret; } @@ -453,8 +536,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct kvec iov; int ret; - _enter("{%d,%s},%zu,%d", - call->debug_id, rxrpc_call_states[call->state], size, want_more); + _enter("{%d,%s},%zu/%zu,%d", + call->debug_id, rxrpc_call_states[call->state], + *_offset, size, want_more); ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); @@ -469,7 +553,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = temp_deliver_data(sock, call, &iter, size, _offset); + ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, + _offset); if (ret < 0) goto out; @@ -494,7 +579,6 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, goto call_complete; default: - *_offset = 0; ret = -EINPROGRESS; goto out; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 3777432df10b..ae392558829d 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -317,6 +317,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, * decrypt partial encryption on a packet (level 1 security) */ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { struct rxkad_level1_hdr 
sechdr; @@ -330,18 +331,20 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, _enter(""); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. + */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0 || nsg > 16) goto nomem; sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, 8); + skb_to_sgvec(skb, sg, offset, 8); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -353,12 +356,12 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -371,18 +374,16 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -395,6 +396,7 @@ nomem: * wholly decrypt a packet (level 2 security) */ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq) { const struct rxrpc_key_token *token; @@ -409,12 +411,14 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, _enter(",{%d}", skb->len); - if (skb->len < 8) { + if (len < 8) { rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. 
+ */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0) goto nomem; @@ -427,7 +431,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, skb->len); + skb_to_sgvec(skb, sg, offset, len); /* decrypt from the session key */ token = call->conn->params.key->payload.data[0]; @@ -435,19 +439,19 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -460,17 +464,16 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - if (data_size > skb->len) { + if (data_size > len) { rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); goto protocol_error; } - if (data_size < skb->len) - skb->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -484,6 +487,7 @@ nomem: * jumbo packet). */ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); @@ -521,6 +525,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (cksum != expected_cksum) { rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } @@ -529,14 +534,60 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_SECURITY_PLAIN: return 0; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, seq); + return rxkad_verify_packet_1(call, skb, offset, len, seq); case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, seq); + return rxkad_verify_packet_2(call, skb, offset, len, seq); default: return -ENOANO; } } +/* + * Locate the data contained in a packet that was partially encrypted. + */ +static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level1_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in a packet that was completely encrypted. + */ +static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level2_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in an already decrypted packet. 
+ */ +static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + switch (call->conn->params.security_level) { + case RXRPC_SECURITY_AUTH: + rxkad_locate_data_1(call, skb, _offset, _len); + return; + case RXRPC_SECURITY_ENCRYPT: + rxkad_locate_data_2(call, skb, _offset, _len); + return; + default: + return; + } +} + /* * issue a challenge */ @@ -704,7 +755,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_challenge challenge; struct rxkad_response resp __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 version, nonce, min_level, abort_code; int ret; @@ -722,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, } abort_code = RXKADPACKETSHORT; - sp = rxrpc_skb(skb); - if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0) goto protocol_error; version = ntohl(challenge.version); @@ -969,7 +1019,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, { struct rxkad_response response __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; time_t expiry; void *ticket; @@ -980,7 +1030,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0) goto protocol_error; if (!pskb_pull(skb, sizeof(response))) BUG(); @@ -988,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, version = ntohl(response.version); ticket_len = ntohl(response.ticket_len); kvno = ntohl(response.kvno); - sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); @@ -1010,7 +1059,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return -ENOMEM; abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0) goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, @@ -1135,6 +1184,7 @@ const struct rxrpc_security rxkad = { .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, + .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 5d79d5a9c944..82d8134e9287 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) } /* find the service */ - read_lock_bh(&local->services_lock); + read_lock(&local->services_lock); hlist_for_each_entry(rx, &local->services, listen_link) { if (rx->srx.srx_service == conn->params.service_id) goto found_service; } /* the service appears to have died */ - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOENT"); return -ENOENT; found_service: if (!rx->securities) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = 
-ENOKEY"); return -ENOKEY; } @@ -152,13 +152,13 @@ found_service: kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc); if (IS_ERR(kref)) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = %ld [search]", PTR_ERR(kref)); return PTR_ERR(kref); } key = key_ref_to_ptr(kref); - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); conn->server_key = key; conn->security = sec; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9a4af992fcdf..cba236575073 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include "ar-internal.h" @@ -38,19 +37,20 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz), - *timeo); + _enter(",{%u,%u,%u}", + call->tx_hard_ack, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 0) + if (call->tx_top - call->tx_hard_ack < call->tx_winsize) break; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -call->error; + break; + } if (signal_pending(current)) { ret = sock_intr_errno(*timeo); break; @@ -68,36 +68,44 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } /* - * attempt to schedule an instant Tx resend + * Schedule an instant Tx resend. */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call) +static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) { - read_lock_bh(&call->state_lock); - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + spin_lock_bh(&call->lock); + + if (call->state < RXRPC_CALL_COMPLETE) { + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } - read_unlock_bh(&call->state_lock); + + spin_unlock_bh(&call->lock); } /* - * queue a packet for transmission, set the resend timer and attempt - * to send the packet immediately + * Queue a DATA packet for transmission, set the resend timeout and send the + * packet immediately */ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, bool last) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int ret; + rxrpc_seq_t seq = sp->hdr.seq; + int ret, ix; + + _net("queue skb %p [%d]", skb, seq); - _net("queue skb %p [%d]", skb, call->acks_head); + ASSERTCMP(seq, ==, call->tx_top + 1); - ASSERT(call->acks_window != NULL); - call->acks_window[call->acks_head] = (unsigned long) skb; + ix = seq & RXRPC_RXTX_BUFF_MASK; + rxrpc_get_skb(skb); + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); - call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + call->rxtx_buffer[ix] = skb; + call->tx_top = seq; + if (last) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); @@ -121,34 +129,17 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout; - 
if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { - _debug("run timer"); - call->resend_timer.expires = sp->resend_at; - add_timer(&call->resend_timer); - } - - /* attempt to cancel the rx-ACK timer, deferring reply transmission if - * we're ACK'ing the request phase of an incoming call */ - ret = -EAGAIN; - if (try_to_del_timer_sync(&call->ack_timer) >= 0) { - /* the packet may be freed by rxrpc_process_call() before this - * returns */ - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call->conn, skb); - _net("sent skb %p", skb); - } else { - _debug("failed to delete ACK timer"); - } + if (seq == 1 && rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + sp->resend_at = jiffies + rxrpc_resend_timeout; + ret = rxrpc_send_data_packet(call->conn, skb); if (ret < 0) { _debug("need instant resend %d", ret); - sp->need_resend = true; - rxrpc_instant_resend(call); + rxrpc_instant_resend(call, ix); } + rxrpc_free_skb(skb); _leave(""); } @@ -212,9 +203,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) <= 0) { + if (call->tx_top - call->tx_hard_ack >= + call->tx_winsize) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; @@ -313,7 +303,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, memset(skb_put(skb, pad), 0, pad); } - seq = atomic_inc_return(&call->sequence); + seq = call->tx_top + 1; sp->hdr.epoch = conn->proto.epoch; sp->hdr.cid = call->cid; @@ -329,9 +319,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 1) + else if (call->tx_top - call->tx_hard_ack < + call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; if (more && seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; @@ -358,7 +347,7 @@ out: call_terminated: rxrpc_free_skb(skb); _leave(" = %d", -call->error); - return ret; + return -call->error; maybe_error: if (copied) @@ -451,29 +440,6 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, return 0; } -/* - * abort a call, sending an ABORT packet to the peer - */ -static void rxrpc_send_abort(struct rxrpc_call *call, const char *why, - u32 abort_code, int error) -{ - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - write_lock_bh(&call->state_lock); - - if (__rxrpc_abort_call(why, call, 0, abort_code, error)) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - rxrpc_queue_call(call); - } - - write_unlock_bh(&call->state_lock); -} - /* * Create a new client call for sendmsg(). 
*/ @@ -549,7 +515,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); } - rxrpc_see_call(call); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -557,8 +522,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, "CMD", abort_code, ECONNABORTED); ret = 0; + if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) + ret = rxrpc_send_call_packet(call, + RXRPC_PACKET_TYPE_ABORT); } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else if (rxrpc_is_client_call(call) && @@ -639,7 +606,8 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, lock_sock(sock->sk); - rxrpc_send_abort(call, why, abort_code, error); + if (rxrpc_abort_call(why, call, 0, abort_code, error)) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); release_sock(sock->sk); _leave(""); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 9b8f8456d3bf..620d9ccaf3c1 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,133 +18,6 @@ #include #include "ar-internal.h" -/* - * set up for the ACK at the end of the receive phase when we discard the final - * receive phase data packet - * - called with softirqs disabled - */ -static void rxrpc_request_final_ACK(struct rxrpc_call *call) -{ - /* the call may be aborted before we have a chance to ACK it */ - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - call->state = RXRPC_CALL_CLIENT_FINAL_ACK; - _debug("request final ACK"); - - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - if (try_to_del_timer_sync(&call->ack_timer) >= 0) - rxrpc_queue_call(call); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - default: - break; - } - - write_unlock(&call->state_lock); -} - -/* - * drop the bottom ACK off of the call ACK window and advance the window - */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int loop; - u32 seq; - - spin_lock_bh(&call->lock); - - _debug("hard ACK #%u", sp->hdr.seq); - - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - call->ackr_window[loop] >>= 1; - call->ackr_window[loop] |= - call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); - } - - seq = sp->hdr.seq; - ASSERTCMP(seq, ==, call->rx_data_eaten + 1); - call->rx_data_eaten = seq; - - if (call->ackr_win_top < UINT_MAX) - call->ackr_win_top++; - - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_post, >=, call->rx_data_recv); - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_recv, >=, call->rx_data_eaten); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - rxrpc_request_final_ACK(call); - } else if (atomic_dec_and_test(&call->ackr_not_idle) && - test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { - /* We previously soft-ACK'd some received packets that have now - * been consumed, so send a hard-ACK if no more packets are - * immediately forthcoming to allow the transmitter to free up - * its Tx bufferage. - */ - _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, - skb->priority, sp->hdr.serial, false); - } - - spin_unlock_bh(&call->lock); -} - -/** - * rxrpc_kernel_data_consumed - Record consumption of data message - * @call: The call to which the message pertains. 
- * @skb: Message holding data - * - * Record the consumption of a data message and generate an ACK if appropriate. - * The call state is shifted if this was the final packet. The caller must be - * in process context with no spinlocks held. - * - * TODO: Actually generate the ACK here rather than punting this to the - * workqueue. - */ -void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); - - ASSERTCMP(sp->call, ==, call); - ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); - - /* TODO: Fix the sequence number tracking */ - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - - call->rx_data_recv = sp->hdr.seq; - rxrpc_hard_ACK_data(call, skb); -} - -/* - * Destroy a packet that has an RxRPC control buffer - */ -void rxrpc_packet_destructor(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - _enter("%p{%p}", skb, call); - - if (call) { - rxrpc_put_call_for_skb(call, skb); - sp->call = NULL; - } - - if (skb->sk) - sock_rfree(skb); - _leave(""); -} - /* * Note the existence of a new-to-us socket buffer (allocated or dequeued). */ -- cgit v1.2.3 From 76061f631c2ea4ab9c4d66f3a96ecc5737f5aaf7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Sep 2016 08:34:11 -0700 Subject: tcp: fastopen: avoid negative sk_forward_alloc When DATA and/or FIN are carried in a SYN/ACK message or SYN message, we append an skb in socket receive queue, but we forget to call sk_forced_mem_schedule(). Effect is that the socket has a negative sk->sk_forward_alloc as long as the message is not read by the application. Josh Hunt fixed a similar issue in commit d22e15371811 ("tcp: fix tcp fin memory accounting") Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Eric Dumazet Reviewed-by: Josh Hunt Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 62a5751d4fe1..4e777a3243f9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -150,6 +150,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); + sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; -- cgit v1.2.3 From d545caca827b65aab557a9e9dcdcf1e5a3823c2d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 4 ++-- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 49 ++++++++++++++++++++++++++++-------------- net/ipv4/udp_diag.c | 10 +++++---- net/sctp/sctp_diag.c | 20 +++++++++++------ 5 files changed, 56 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index feb04ea20f11..65da430e260f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns); + struct user_namespace *user_ns, bool net_admin); extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5581206a08ae..b5c366f87b3e 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -123,6 +123,7 @@ enum { INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, + INET_DIAG_MARK, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index abfbe492ebfe..e4d16fc5bbb3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void) int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns) + struct user_namespace *user_ns, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); @@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace 
*user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; - r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if 
(inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -948,7 +965,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 58b79c0c0d69..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index f3508aa75815..807158e32f5f 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; @@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r->idiag_retrans = 0; } - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { @@ -203,6 +204,7 @@ struct sctp_comm_param { struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; + bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) @@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) @@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) err = inet_sctp_diag_fill(sk, assoc, 
rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); release_sock(sk); if (err < 0) { WARN_ON(err == -EMSGSIZE); @@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh) < 0) { + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { cb->args[3] = 1; err = 2; goto release; @@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { err = 2; goto release; } @@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh) < 0) { + cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } @@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, .skb = in_skb, .r = req, .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { @@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .skb = skb, .cb = cb, .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; /* eps hashtable dumps -- cgit v1.2.3 From 018c1dda5ff1e7bd1fe2d9fd1d0f5b82dc6fc0cd Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 7 Sep 2016 12:56:59 -0400 Subject: openvswitch: 802.1AD Flow handling, actions, vlan parsing, netlink attributes Add support for 802.1ad including the ability to push and pop double tagged vlans. Add support for 802.1ad to netlink parsing and flow conversion. Uses double nested encap attributes to represent double tagged vlan. Inner TPID encoded along with ctci in nested attributes. This is based on Thomas F Herbert's original v20 patch. I made some small clean ups and bug fixes. Signed-off-by: Thomas F Herbert Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/openvswitch/actions.c | 16 ++- net/openvswitch/flow.c | 65 ++++++--- net/openvswitch/flow.h | 8 +- net/openvswitch/flow_netlink.c | 310 ++++++++++++++++++++++++++++------------- net/openvswitch/vport.c | 7 +- 5 files changed, 282 insertions(+), 124 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ca91fc33f8a9..4fe9032b1160 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -246,20 +246,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = 0; + } else { + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + } return err; } static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = vlan->vlan_tci; + } else { + key->eth.vlan.tci = vlan->vlan_tci; + key->eth.vlan.tpid = vlan->vlan_tpid; + } return skb_vlan_push(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0ea128eeeab2..1240ae3b88d2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -302,24 +302,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +/** + * Parse vlan tag from vlan header. + * Returns ERROR on memory error. + * Returns 0 if it encounters a non-vlan or incomplete packet. + * Returns 1 after successfully parsing vlan tag. + */ +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) { - struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ - __be16 tci; - }; - struct qtag_prefix *qp; + struct vlan_head *vh = (struct vlan_head *)skb->data; - if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + if (likely(!eth_type_vlan(vh->tpid))) return 0; - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + - sizeof(__be16)))) + if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + + sizeof(__be16)))) return -ENOMEM; - qp = (struct qtag_prefix *) skb->data; - key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + vh = (struct vlan_head *)skb->data; + key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tpid = vh->tpid; + + __skb_pull(skb, sizeof(struct vlan_head)); + return 1; +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int res; + + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + key->eth.cvlan.tci = 0; + key->eth.cvlan.tpid = 0; + + if (likely(skb_vlan_tag_present(skb))) { + key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tpid = skb->vlan_proto; + } else { + /* Parse outer vlan tag in the non-accelerated case. */ + res = parse_vlan_tag(skb, &key->eth.vlan); + if (res <= 0) + return res; + } + + /* Parse inner vlan tag. */ + res = parse_vlan_tag(skb, &key->eth.cvlan); + if (res <= 0) + return res; return 0; } @@ -480,12 +513,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. 
*/ - key->eth.tci = 0; - if (skb_vlan_tag_present(skb)) - key->eth.tci = htons(skb->vlan_tci); - else if (eth->h_proto == htons(ETH_P_8021Q)) - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; key->eth.type = parse_ethertype(skb); if (unlikely(key->eth.type == htons(0))) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 03378e75a67c..156a3029c17b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -50,6 +50,11 @@ struct ovs_tunnel_info { struct metadata_dst *tun_dst; }; +struct vlan_head { + __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -69,7 +74,8 @@ struct sw_flow_key { struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + struct vlan_head vlan; + struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; union { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c78a6a1476fb..8efa718ddb5e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, ip_tunnel_info_af(tun_info)); } +static int encode_vlan_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *a[], + bool is_mask, bool inner) +{ + __be16 tci = 0; + __be16 tpid = 0; + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (likely(!inner)) { + SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); + } else { + SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); + } + return 0; +} + +static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { + /* Not a VLAN. */ + return 0; + } + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (tci) { + OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { + /* Corner case for truncated VLAN header. */ + OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + } + + return 1; +} + +static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + __be16 tpid = 0; + bool encap_valid = !!(match->key->eth.vlan.tci & + htons(VLAN_TAG_PRESENT)); + bool i_encap_valid = !!(match->key->eth.cvlan.tci & + htons(VLAN_TAG_PRESENT)); + + if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { + /* Not a VLAN. 
*/ + return 0; + } + + if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { + OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (tpid != htons(0xffff)) { + OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", + (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); + return -EINVAL; + } + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + return 1; +} + +static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, bool inner, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + const struct nlattr *encap; + + if (!is_mask) + err = validate_vlan_from_nlattrs(match, *key_attrs, inner, + a, log); + else + err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, + a, log); + if (err <= 0) + return err; + + err = encode_vlan_from_nlattrs(match, a, is_mask, inner); + if (err) + return err; + + *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + + encap = a[OVS_KEY_ATTR_ENCAP]; + + if (!is_mask) + err = parse_flow_nlattrs(encap, a, key_attrs, log); + else + err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); + + return err; +} + +static int parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + bool encap_valid = false; + + err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, + is_mask, log); + if (err) + return err; + + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + if (encap_valid) { + err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, + is_mask, log); + if (err) + return err; + } + + return 0; +} + static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask, bool log) @@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, } if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); - else - OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. 
+ */ + OVS_NLERR(log, "VLAN attribute unexpected."); + return -EINVAL; } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { @@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; struct nlattr *newmask = NULL; u64 key_attrs = 0; u64 mask_attrs = 0; - bool encap_valid = false; int err; err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR(log, "Invalid Vlan frame."); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs, log); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) { - OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); - return -EINVAL; - } - } else { - OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); - return -EINVAL; - } - } + err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); + if (err) + return err; err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) @@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, goto free_newmask; /* Always match on tci. */ - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); - err = -EINVAL; - goto free_newmask; - } - - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, - &mask_attrs, log); - if (err) - goto free_newmask; - } else { - OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", - ntohs(eth_type)); - err = -EINVAL; - goto free_newmask; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", - ntohs(tci)); - err = -EINVAL; - goto free_newmask; - } - } + err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); + if (err) + goto free_newmask; err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, log); @@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } +static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, + bool is_mask) +{ + __be16 eth_type = !is_mask ? 
vh->tpid : htons(0xffff); + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) + return -EMSGSIZE; + return 0; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; + struct nlattr *nla; + struct nlattr *encap = NULL; + struct nlattr *in_encap = NULL; if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) goto nla_put_failure; @@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(eth_key->eth_src, output->eth.src); ether_addr_copy(eth_key->eth_dst, output->eth.dst); - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) + if (!swkey->eth.vlan.tci) goto unencap; - } else - encap = NULL; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } + } if (swkey->eth.type == htons(ETH_P_802_2)) { /* @@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; + if (eth_type_vlan(swkey->eth.type)) { + /* There are 3 VLAN tags, we don't know anything about the rest + * of the packet, so truncate here. + */ + WARN_ON_ONCE(!(encap && in_encap)); + goto unencap; + } + if (swkey->eth.type == htons(ETH_P_IP)) { struct ovs_key_ipv4 *ipv4_key; @@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, } unencap: + if (in_encap) + nla_nest_end(skb, in_encap); if (encap) nla_nest_end(skb, encap); @@ -2283,7 +2397,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_PUSH_VLAN: vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; @@ -2388,7 +2502,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, - key->eth.tci, log); + key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6b21fd068d87..8f198437c724 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -485,9 +485,14 @@ static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; - if (skb->protocol == htons(ETH_P_8021Q)) + if (skb_vlan_tagged(skb)) length -= VLAN_HLEN; + /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow + * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none + * account for 802.1ad. e.g. is_skb_forwardable(). 
+ */ + return length; } -- cgit v1.2.3 From db7196a0d0984b933ccf2cd6a60e26abf466e8a3 Mon Sep 17 00:00:00 2001 From: Artem Germanov Date: Wed, 7 Sep 2016 10:49:36 -0700 Subject: tcp: cwnd does not increase in TCP YeAH Commit 76174004a0f19785a328f40388e87e982bbf69b9 (tcp: do not slow start when cwnd equals ssthresh ) introduced regression in TCP YeAH. Using 100ms delay 1% loss virtual ethernet link kernel 4.2 shows bandwidth ~500KB/s for single TCP connection and kernel 4.3 and above (including 4.8-rc4) shows bandwidth ~100KB/s. That is caused by stalled cwnd when cwnd equals ssthresh. This patch fixes it by proper increasing cwnd in this case. Signed-off-by: Artem Germanov Acked-by: Dmitry Adamushko Signed-off-by: David S. Miller --- net/ipv4/tcp_yeah.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 028eb046ea40..9c5fc973267f 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -76,7 +76,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else if (!yeah->doing_reno_now) { -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 + include/linux/tcp.h | 7 +- include/net/tcp.h | 2 +- net/core/skbuff.c | 19 +++ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 330 +++++++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 1 - 8 files changed, 218 insertions(+), 149 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list) } EXPORT_SYMBOL(skb_queue_purge); +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). 
+ */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. 
+ */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. 
+ */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? 
skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. 
*/ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. 
When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.2.3 From e895cdce683161081e3626c4f5a5c55cb72089f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Sep 2016 21:52:56 -0700 Subject: ipv4: accept u8 in IP_TOS ancillary data In commit f02db315b8d8 ("ipv4: IP_TOS and IP_TTL can be specified as ancillary data") Francesco added IP_TOS values specified as integer. However, kernel sends to userspace (at recvmsg() time) an IP_TOS value in a single byte, when IP_RECVTOS is set on the socket. It can be very useful to reflect all ancillary options as given by the kernel in a subsequent sendmsg(), instead of aborting the sendmsg() with EINVAL after Francesco patch. So this patch extends IP_TOS ancillary to accept an u8, so that an UDP server can simply reuse same ancillary block without having to mangle it. Jesper can then augment https://github.com/netoptimizer/network-testing/blob/master/src/udp_example02.c to add TOS reflection ;) Fixes: f02db315b8d8 ("ipv4: IP_TOS and IP_TTL can be specified as ancillary data") Signed-off-by: Eric Dumazet Cc: Francesco Fusco Cc: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 71a52f4d4cff..af4919792b6a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->ttl = val; break; case IP_TOS: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) + val = *(int *)CMSG_DATA(cmsg); + else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) + val = *(u8 *)CMSG_DATA(cmsg); + else return -EINVAL; - val = *(int *)CMSG_DATA(cmsg); if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; -- cgit v1.2.3 From 2f30ea5090cbc57ea573cdc66421264b3de3fb0a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 8 Sep 2016 18:09:57 +0200 Subject: xfrm_user: propagate sec ctx allocation errors When we fail to attach the security context in xfrm_state_construct() we'll return 0 as error value which, in turn, will wrongly claim success to userland when, in fact, we won't be adding / updating the XFRM state. This is a regression introduced by commit fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl(), attach_sec_ctx(), and attach_one_addr()"). Fix it by propagating the error returned by security_xfrm_state_alloc() in this case. Fixes: fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl()...") Signed-off-by: Mathias Krause Cc: Thomas Graf Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index cb65d916a345..08892091cfe3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) -- cgit v1.2.3 From 1fb81e09d487656aa23f2acb1232c7f56b4c2367 Mon Sep 17 00:00:00 2001 From: "thomas.zeitlhofer+lkml@ze-it.at" Date: Wed, 7 Sep 2016 20:40:38 +0200 Subject: vti: use right inner_mode for inbound inter address family policy checks In case of inter address family tunneling (IPv6 over vti4 or IPv4 over vti6), the inbound policy checks in vti_rcv_cb() and vti6_rcv_cb() are using the wrong address family. As a result, all inbound inter address family traffic is dropped. Use the xfrm_ip2inner_mode() helper, as done in xfrm_input() (i.e., also increment LINUX_MIB_XFRMINSTATEMODEERROR in case of error), to select the inner_mode that contains the right address family for the inbound policy checks. 
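A condensed sketch of the pattern this change applies in both vti_rcv_cb() and vti6_rcv_cb(), restating only what is visible in the hunks below and assuming the surrounding receive-callback context (skb and the local family variable); the hunks themselves are the authoritative change:

	struct xfrm_state *x = xfrm_input_state(skb);
	struct xfrm_mode *inner_mode = x->inner_mode;

	if (x->sel.family == AF_UNSPEC) {
		/* Inter address family tunnel: the selector does not pin a
		 * family, so derive the inner mode (and thus the family)
		 * from the decapsulated packet's protocol, as xfrm_input()
		 * does.
		 */
		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
		if (inner_mode == NULL) {
			XFRM_INC_STATS(dev_net(skb->dev),
				       LINUX_MIB_XFRMINSTATEMODEERROR);
			return -EINVAL;
		}
	}

	family = inner_mode->afinfo->family;	/* used for xfrm_policy_check() */

This mirrors the xfrm_input() behaviour referenced above, including bumping LINUX_MIB_XFRMINSTATEMODEERROR when no inner mode can be derived.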
Signed-off-by: Thomas Zeitlhofer Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 15 ++++++++++++++- net/ipv6/ip6_vti.c | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cc701fa70b12..5d7944f394d9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14040..52a2f735881f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -340,6 +340,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -357,7 +358,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); -- cgit v1.2.3 From b93e1fa7106582e3a81cc818b719e0341585ff1b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 7 Sep 2016 17:20:46 +0200 Subject: ipv4: fix value of ->nlmsg_flags reported in RTM_NEWROUTE events fib_table_insert() inconsistently fills the nlmsg_flags field in its notification messages. Since commit b8f558313506 ("[RTNETLINK]: Fix sending netlink message when replace route."), the netlink message has its nlmsg_flags set to NLM_F_REPLACE if the route replaced a preexisting one. Then commit a2bb6d7d6f42 ("ipv4: include NLM_F_APPEND flag in append route notifications") started setting nlmsg_flags to NLM_F_APPEND if the route matched a preexisting one but was appended. In other cases (exclusive creation or prepend), nlmsg_flags is 0. This patch sets ->nlmsg_flags in all situations, preserving the semantic of the NLM_F_* bits: * NLM_F_CREATE: a new fib entry has been created for this route. * NLM_F_EXCL: no other fib entry existed for this route. * NLM_F_REPLACE: this route has overwritten a preexisting fib entry. * NLM_F_APPEND: the new fib entry was added after other entries for the same route. As a result, the possible flag combination can now be reported (iproute2's terminology into parentheses): * NLM_F_CREATE | NLM_F_EXCL: route didn't exist, exclusive creation ("add"). 
* NLM_F_CREATE | NLM_F_APPEND: route did already exist, new route added after preexisting ones ("append"). * NLM_F_CREATE: route did already exist, new route added before preexisting ones ("prepend"). * NLM_F_REPLACE: route did already exist, new route replaced the first preexisting one ("change"). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e2ffc2a5c7db..241f27bbd7ad 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1081,7 +1081,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; - unsigned int nlflags = 0; + u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1126,6 +1126,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (cfg->fc_nlflags & NLM_F_EXCL) goto out; + nlflags &= ~NLM_F_EXCL; + /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes @@ -1151,6 +1153,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct fib_info *fi_drop; u8 state; + nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) @@ -1191,7 +1194,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); + tb->tb_id, &cfg->fc_nlinfo, nlflags); goto succeeded; } @@ -1203,7 +1206,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) - nlflags = NLM_F_APPEND; + nlflags |= NLM_F_APPEND; else fa = fa_first; } @@ -1211,6 +1214,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; + nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) -- cgit v1.2.3 From 73483c1289d148282be3aac3ad30b4aa1f8fac87 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 7 Sep 2016 17:21:40 +0200 Subject: ipv6: report NLM_F_CREATE and NLM_F_EXCL flags in RTM_NEWROUTE events Since commit 37a1d3611c12 ("ipv6: include NLM_F_REPLACE in route replace notifications"), RTM_NEWROUTE notifications have their NLM_F_REPLACE flag set if the new route replaced a preexisting one. However, other flags aren't set. This patch reports the missing NLM_F_CREATE and NLM_F_EXCL flag bits. NLM_F_APPEND is not reported, because in ipv6 a NLM_F_CREATE request is interpreted as an append request (contrary to ipv4, "prepend" is not supported, so if NLM_F_EXCL is not set then NLM_F_APPEND is implicit). As a result, the possible flag combination can now be reported (iproute2's terminology into parentheses): * NLM_F_CREATE | NLM_F_EXCL: route didn't exist, exclusive creation ("add"). * NLM_F_CREATE: route did already exist, new route added after preexisting ones ("append"). * NLM_F_REPLACE: route did already exist, new route replaced the first preexisting one ("change"). Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 771be1fa4176..ef5485204522 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + u16 nlflags = NLM_F_EXCL; int err; ins = &fn->leaf; @@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; + + nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; @@ -856,6 +859,7 @@ next_iter: pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: + nlflags |= NLM_F_CREATE; err = fib6_commit_metrics(&rt->dst, mxc); if (err) return err; @@ -864,7 +868,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { -- cgit v1.2.3 From b8b867e132d2c32f16a49b3ce5c11ee289a92c4e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 7 Sep 2016 13:57:36 -0700 Subject: rtnetlink: remove unused ifla_stats_policy This structure is defined but never used. Flagged with W=1 Signed-off-by: Stephen Hemminger Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1dfca1c3f8f5..937e459bdaa9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3669,10 +3669,6 @@ nla_put_failure: return -EMSGSIZE; } -static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { - [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, -}; - static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { -- cgit v1.2.3 From 05f1b12f71a49848730a0eb9acda032d5c43432b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 8 Sep 2016 08:42:06 +0100 Subject: net: x25: remove null checks on arrays calling_ae and called_ae dtefacs.calling_ae and called_ae are both 20 element __u8 arrays and cannot be null and hence are redundant checks. Remove these. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/x25/af_x25.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index a750f330b8dd..f83b74d3e2ac 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1500,12 +1500,8 @@ out_fac_release: goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; - if (dtefacs.calling_ae == NULL) - goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; - if (dtefacs.called_ae == NULL) - goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: -- cgit v1.2.3 From 7303a1475008bee5c3e82a06a282568415690d72 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 8 Sep 2016 17:54:11 +0800 Subject: sctp: identify chunks that need to be fragmented at IP level Previously, without GSO, it was easy to identify it: if the chunk didn't fit and there was no data chunk in the packet yet, we could fragment at IP level. 
So if there was an auth chunk and we were bundling a big data chunk, it would fragment regardless of the size of the auth chunk. This also works for the context of PMTU reductions. But with GSO, we cannot distinguish such PMTU events anymore, as the packet is allowed to exceed PMTU. So we need another check: to ensure that the chunk that we are adding, actually fits the current PMTU. If it doesn't, trigger a flush and let it be fragmented at IP level in the next round. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/output.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 1f1682b9a6a8..31b7bc35895d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -878,7 +878,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize, pmtu; + size_t psize, pmtu, maxsize; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; @@ -906,6 +906,17 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, goto out; } + /* Similarly, if this chunk was built before a PMTU + * reduction, we have to fragment it at IP level now. So + * if the packet already contains something, we need to + * flush. + */ + maxsize = pmtu - packet->overhead; + if (packet->auth) + maxsize -= WORD_ROUND(packet->auth->skb->len); + if (chunk_len > maxsize) + retval = SCTP_XMIT_PMTU_FULL; + /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of -- cgit v1.2.3 From bf8d85d4f907d2156e6e5d2831378527957d9bde Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Sep 2016 15:40:48 -0700 Subject: ip_tunnel: do not clear l4 hashes If skb has a valid l4 hash, there is no point clearing hash and force a further flow dissection when a tunnel encapsulation is added. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 0f227db0e9ac..777bc1883870 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); - skb_clear_hash(skb); + skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -- cgit v1.2.3 From 6088b5823b4cb132a838878747384cbfb5ce6646 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:28 +0200 Subject: bpf: minor cleanups in helpers Some minor misc cleanups, f.e. use sizeof(__u32) instead of hardcoding and in __bpf_skb_max_len(), I missed that we always have skb->dev valid anyway, so we can drop the unneeded test for dev; also few more other misc bits addressed here. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/helpers.c | 6 +++--- net/core/filter.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ea3afba1a4f..6df73bd1ba34 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -126,7 +126,7 @@ static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct task_struct *task = current; - if (!task) + if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; @@ -144,12 +144,12 @@ static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) kuid_t uid; kgid_t gid; - if (!task) + if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | - from_kuid(&init_user_ns, uid); + from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { diff --git a/net/core/filter.c b/net/core/filter.c index a83766be1ad2..628ed8c7d38d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2010,8 +2010,7 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb) static u32 __bpf_skb_max_len(const struct sk_buff *skb) { - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - 65536; + return skb->dev->mtu + skb->dev->hard_header_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) @@ -2605,7 +2604,7 @@ static bool __is_valid_xdp_access(int off, int size, return false; if (off % size != 0) return false; - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -2727,7 +2726,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, dst_reg, src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ net/core/filter.c | 15 +++++++-------- 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index a16439b99fd9..7fabad8dc3fc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -314,6 +314,20 @@ struct bpf_prog_aux; bpf_size; \ }) +#define BPF_SIZEOF(type) \ + ({ \ + const int __size = bytes_to_bpf_size(sizeof(type)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + +#define BPF_FIELD_SIZEOF(type, field) \ + ({ \ + const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d3869b03d9fe..e63d7d435796 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,18 +583,18 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), - dst_reg, src_reg, + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, offsetof(struct perf_sample_data, period)); break; default: - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), - dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), - dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index 628ed8c7d38d..120c813ef030 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -233,9 +233,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -2685,7 +2684,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2750,7 +2749,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2759,8 +2758,8 @@ static 
u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2795,12 +2794,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; -- cgit v1.2.3 From 374fb54eeaaa6b2cb82bca73a11273687bb2a96a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:30 +0200 Subject: bpf: add own ctx rewriter on ifindex for clsact progs When fetching ifindex, we don't need to test dev for being NULL since we're always guaranteed to have a valid dev for clsact programs. Thus, avoid this test in fast path. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 120c813ef030..d6d9bb89ce3a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2634,10 +2634,10 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -2785,6 +2785,31 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + default: + return sk_filter_convert_ctx_access(type, dst_reg, src_reg, + ctx_off, insn_buf, prog); + } + + return insn - insn_buf; +} + static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2811,13 +2836,13 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; 
static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion like we do with SYSCALL_DEFINE() macros that are used today. Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, eases to write future helpers, avoids getting the casting mess wrong, and allows for extending all helpers at once (f.e. build time checks, etc). It also helps detecting more easily in code reviews that unused registers are not instrumented in the code by accident, breaking compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as an argument. I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they look all as expected. No sparse issue spotted. We let this also sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 50 ++++++++++++++++++ kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 46 +++++------------ kernel/bpf/stackmap.c | 5 +- kernel/trace/bpf_trace.c | 75 +++++++++++++-------------- net/core/filter.c | 129 ++++++++++++++++++----------------------------- 6 files changed, 149 insertions(+), 158 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fabad8dc3fc..1f09c521adfe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -328,6 +328,56 @@ struct bpf_prog_aux; __size; \ }) +#define __BPF_MAP_0(m, v, ...) v +#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) + +#define __BPF_REG_0(...) __BPF_PAD(5) +#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +#define __BPF_REG_5(...) 
__BPF_MAP(5, __VA_ARGS__) + +#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) + +#define __BPF_CAST(t, a) \ + (__force t) \ + (__force \ + typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ + (unsigned long)0, (t)0))) a +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a) t a +#define __BPF_DECL_REGS(t, a) u64 a + +#define __BPF_PAD(n) \ + __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ + u64, __ur_3, u64, __ur_4, u64, __ur_5) + +#define BPF_CALL_x(x, name, ...) \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + { \ + return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + } \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..7b7baaed9ed4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6df73bd1ba34..a5b8bf8cfcfd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,24 +27,10 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. 
*/ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { @@ -54,15 +41,11 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { @@ -75,13 +58,9 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } @@ -99,7 +78,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +89,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,7 +101,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -138,7 +117,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; @@ -158,10 +137,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, 
u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e63d7d435796..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -61,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -83,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -130,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -178,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -209,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? 
(long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -233,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -312,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -367,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -378,16 +370,13 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { - struct bpf_map *map = (struct bpf_map *)(long)r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - u32 idx = (u32)r2; if (unlikely(in_interrupt())) return -EINVAL; - if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -481,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -504,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. 
+ */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { diff --git a/net/core/filter.c b/net/core/filter.c index d6d9bb89ce3a..298b146b47e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -1376,12 +1374,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1416,12 +1411,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1449,10 +1441,9 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1494,12 +1485,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = 
(unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1547,12 +1537,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1610,9 +1599,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1648,7 +1636,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1687,9 +1675,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1699,9 +1687,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1711,14 +1699,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. 
*/ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1728,10 +1716,9 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1756,9 +1743,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1933,10 +1919,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1973,11 +1958,8 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) @@ -2028,12 +2010,11 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); - u32 new_len = (u32) r2; int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) @@ -2113,13 +2094,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2146,10 +2124,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2214,10 +2191,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2252,10 +2227,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2273,7 +2247,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2323,10 +2297,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2372,23 +2345,20 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; sk = skb->sk; if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; @@ -2411,13 +2381,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, return 0; } -static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct xdp_buff *xdp = (struct xdp_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & 
~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; -- cgit v1.2.3 From d817f432c2ab7639a4f69de73eafdc55e57c45ad Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:45 +0300 Subject: net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id() Add utility functions to convert a 32 bits key into a 64 bits tunnel and vice versa. These functions will be used instead of cloning code in GRE and VXLAN, and in tc act_iptunnel which will be introduced in a following patch in this patchset. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jiri Benc Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++-- include/net/ip_tunnels.h | 19 +++++++++++++++++++ include/net/vxlan.h | 18 ------------------ net/ipv4/ip_gre.c | 23 ++--------------------- 4 files changed, 23 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 199dec033cf8..4bfeb9765c55 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1291,7 +1291,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) struct metadata_dst *tun_dst; tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, - vxlan_vni_to_tun_id(vni), sizeof(*md)); + key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) goto drop; @@ -1945,7 +1945,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - vni = vxlan_tun_id_to_vni(info->key.tun_id); + vni = tunnel_id_to_key32(info->key.tun_id); remote_ip.sa.sa_family = ip_tunnel_info_af(info); if (remote_ip.sa.sa_family == AF_INET) { remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a5e7035fb93f..e598c639aa6f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -222,6 +222,25 @@ static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } +static inline __be64 key32_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)key; +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. 
*/ +static inline __be32 tunnel_id_to_key32(__be64 tun_id) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)tun_id; +#else + return (__force __be32)((__force u64)tun_id >> 32); +#endif +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b96d0360c095..0255613a54a4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -350,24 +350,6 @@ static inline __be32 vxlan_vni_field(__be32 vni) #endif } -static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)tun_id; -#else - return (__force __be32)((__force u64)tun_id >> 32); -#endif -} - -static inline __be64 vxlan_vni_to_tun_id(__be32 vni) -{ -#if defined(__BIG_ENDIAN) - return (__force __be64)vni; -#else - return (__force __be64)((u64)(__force u32)vni << 32); -#endif -} - static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 113cc43df789..576f705d8180 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } -static __be64 key_to_tunnel_id(__be32 key) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u32)key); -#else - return (__force __be64)((__force u64)key << 32); -#endif -} - -/* Returns the least-significant 32 bits of a __be64. */ -static __be32 tunnel_id_to_key(__be64 x) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, __be64 tun_id; flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); - tun_id = key_to_tunnel_id(tpi->key); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; @@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; -- cgit v1.2.3 From bc3103f1ed405de587fa43d8b0671e615505a700 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:47 +0300 Subject: net/sched: cls_flower: Classify packet in ip tunnels Introduce classifying by metadata extracted by the tunnel device. Outer header fields - source/dest ip and tunnel id, are extracted from the metadata when classifying. For example, the following will add a filter on the ingress Qdisc of shared vxlan device named 'vxlan0'. To forward packets with outer src ip 11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be forwarded to tap device 'vnet0' (after metadata is released): $ tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 11.11.0.2 \ enc_dst_ip 11.11.0.1 \ enc_key_id 11 \ dst_ip 11.11.11.1 \ action tunnel_key release \ action mirred egress redirect dev vnet0 The action tunnel_key, will be introduced in the next patch in this series. Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 11 +++++ net/sched/cls_flower.c | 100 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 51b5b247fb5a..f9c287c67eae 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -431,6 +431,17 @@ enum { TCA_FLOWER_KEY_VLAN_ID, TCA_FLOWER_KEY_VLAN_PRIO, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index cf9ad5b50889..b084b2aab2d7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -23,9 +23,13 @@ #include #include +#include +#include + struct fl_flow_key { int indev_ifindex; struct flow_dissector_key_control control; + struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; @@ -35,6 +39,11 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_keyid enc_key_id; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -124,11 +133,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); + + info = skb_tunnel_info(skb); + if (info) { + struct ip_tunnel_key *key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_key.enc_ipv4.src = key->u.ipv4.src; + skb_key.enc_ipv4.dst = key->u.ipv4.dst; + break; + case AF_INET6: + skb_key.enc_ipv6.src = key->u.ipv6.src; + skb_key.enc_ipv6.dst = key->u.ipv6.dst; + break; + } + + skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); + } + skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. 
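/*
 * Illustrative aside, not part of the patch: a sketch of the tunnel id
 * helpers used in the classify hunk above (introduced earlier in this
 * series). tunnel_id_to_key32() returns the 32-bit key held in the
 * least-significant 32 bits of the big-endian 64-bit tun_id, and
 * key32_to_tunnel_id() is its inverse; the #ifdef in their definitions only
 * picks the cast/shift needed for each host byte order. The VNI value below
 * is hypothetical.
 */
static inline void tunnel_key_id_sketch(void)
{
	__be32 vni = htonl(11);				/* e.g. a VXLAN VNI of 11 */
	__be64 tun_id = key32_to_tunnel_id(vni);	/* as stored in ip_tunnel_key.tun_id */

	/* the conversion round-trips on both big- and little-endian hosts */
	WARN_ON(tunnel_id_to_key32(tun_id) != vni);
}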
@@ -297,7 +326,15 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, - + [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, }; static void fl_set_key_val(struct nlattr **tb, @@ -409,6 +446,40 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->tp.dst)); } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, + &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)); + fl_set_key_val(tb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, + &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)); + } + + if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, + &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)); + fl_set_key_val(tb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)); + } + + fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id.keyid)); + return 0; } @@ -821,6 +892,33 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.dst)))) goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)) || + fl_dump_key_val(skb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)))) + goto nla_put_failure; + else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)) || + fl_dump_key_val(skb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)))) + goto nla_put_failure; + + if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + sizeof(key->enc_key_id))) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) -- cgit v1.2.3 From d0f6dd8a914f42c6f1a3a8c08caa16559d3d9a1b Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Thu, 8 Sep 2016 16:23:48 +0300 Subject: net/sched: Introduce act_tunnel_key This action could be used before redirecting 
packets to a shared tunnel device, or when redirecting packets arriving from such a device. The action will release the metadata created by the tunnel device (decap), or set the metadata with the specified values for an encap operation. For example, the following flower filter will forward all ICMP packets destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before redirecting, metadata for the vxlan tunnel is created using the tunnel_key action and its arguments: $ tc filter add dev net0 protocol ip parent ffff: \ flower \ ip_proto 1 \ dst_ip 11.11.11.2 \ action tunnel_key set \ src_ip 11.11.0.1 \ dst_ip 11.11.0.2 \ id 11 \ action mirred egress redirect dev vxlan0 Signed-off-by: Amir Vadai Signed-off-by: Hadar Hen Zion Reviewed-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 30 +++ include/uapi/linux/tc_act/tc_tunnel_key.h | 41 ++++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/act_tunnel_key.c | 351 ++++++++++++++++++++++++++++++ 5 files changed, 434 insertions(+) create mode 100644 include/net/tc_act/tc_tunnel_key.h create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h create mode 100644 net/sched/act_tunnel_key.c (limited to 'net') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h new file mode 100644 index 000000000000..253f8da6c2a6 --- /dev/null +++ b/include/net/tc_act/tc_tunnel_key.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NET_TC_TUNNEL_KEY_H +#define __NET_TC_TUNNEL_KEY_H + +#include + +struct tcf_tunnel_key_params { + struct rcu_head rcu; + int tcft_action; + int action; + struct metadata_dst *tcft_enc_metadata; +}; + +struct tcf_tunnel_key { + struct tc_action common; + struct tcf_tunnel_key_params __rcu *params; +}; + +#define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) + +#endif /* __NET_TC_TUNNEL_KEY_H */ diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h new file mode 100644 index 000000000000..890106ff16e6 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ */ + +#ifndef __LINUX_TC_TUNNEL_KEY_H +#define __LINUX_TC_TUNNEL_KEY_H + +#include + +#define TCA_ACT_TUNNEL_KEY 17 + +#define TCA_TUNNEL_KEY_ACT_SET 1 +#define TCA_TUNNEL_KEY_ACT_RELEASE 2 + +struct tc_tunnel_key { + tc_gen; + int t_action; +}; + +enum { + TCA_TUNNEL_KEY_UNSPEC, + TCA_TUNNEL_KEY_TM, + TCA_TUNNEL_KEY_PARMS, + TCA_TUNNEL_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV4_DST, /* be32 */ + TCA_TUNNEL_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */ + TCA_TUNNEL_KEY_PAD, + __TCA_TUNNEL_KEY_MAX, +}; + +#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ccf931b3b94c..72e3426fa48f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -761,6 +761,17 @@ config NET_ACT_IFE To compile this code as a module, choose M here: the module will be called act_ife. +config NET_ACT_TUNNEL_KEY + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- + Say Y here to set/release ip tunnel metadata. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_tunnel_key. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index ae088a5a9d95..b9d046b9535a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c new file mode 100644 index 000000000000..dceff7412dc3 --- /dev/null +++ b/net/sched/act_tunnel_key.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2016, Amir Vadai + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TUNNEL_KEY_TAB_MASK 15 + +static int tunnel_key_net_id; +static struct tc_action_ops act_tunnel_key_ops; + +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + int action; + + rcu_read_lock(); + + params = rcu_dereference(t->params); + + tcf_lastuse_update(&t->tcf_tm); + bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + action = params->action; + + switch (params->tcft_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + skb_dst_drop(skb); + break; + case TCA_TUNNEL_KEY_ACT_SET: + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); + break; + default: + WARN_ONCE(1, "Bad tunnel_key action %d.\n", + params->tcft_action); + break; + } + + rcu_read_unlock(); + + return action; +} + +static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { + [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, + [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, +}; + +static int tunnel_key_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; + struct tcf_tunnel_key_params *params_old; + struct tcf_tunnel_key_params *params_new; + struct metadata_dst *metadata = NULL; + struct tc_tunnel_key *parm; + struct tcf_tunnel_key *t; + bool exists = false; + __be64 key_id; + int ret = 0; + int err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + switch (parm->t_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + break; + case TCA_TUNNEL_KEY_ACT_SET: + if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + ret = -EINVAL; + goto err_out; + } + + key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { + __be32 saddr; + __be32 daddr; + + saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); + daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); + + metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + TUNNEL_KEY, key_id, 0); + } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { + struct in6_addr saddr; + struct in6_addr daddr; + + saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); + daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); + + metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, + TUNNEL_KEY, key_id, 0); + } + + if (!metadata) { + ret = -EINVAL; + goto err_out; + } + + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; + break; + default: + goto err_out; + } + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_tunnel_key_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return 
-EEXIST; + } + + t = to_tunnel_key(*a); + + ASSERT_RTNL(); + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + params_old = rtnl_dereference(t->params); + + params_new->action = parm->action; + params_new->tcft_action = parm->t_action; + params_new->tcft_enc_metadata = metadata; + + rcu_assign_pointer(t->params, params_new); + + if (params_old) + kfree_rcu(params_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + + return ret; + +err_out: + if (exists) + tcf_hash_release(*a, bind); + return ret; +} + +static void tunnel_key_release(struct tc_action *a, int bind) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + dst_release(¶ms->tcft_enc_metadata->dst); + + kfree_rcu(params, rcu); + + rcu_read_unlock(); +} + +static int tunnel_key_dump_addresses(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + unsigned short family = ip_tunnel_info_af(info); + + if (family == AF_INET) { + __be32 saddr = info->key.u.ipv4.src; + __be32 daddr = info->key.u.ipv4.dst; + + if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && + !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) + return 0; + } + + if (family == AF_INET6) { + const struct in6_addr *saddr6 = &info->key.u.ipv6.src; + const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; + + if (!nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && + !nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) + return 0; + } + + return -EINVAL; +} + +static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + struct tc_tunnel_key opt = { + .index = t->tcf_index, + .refcnt = t->tcf_refcnt - ref, + .bindcnt = t->tcf_bindcnt - bind, + }; + struct tcf_t tm; + int ret = -1; + + rcu_read_lock(); + params = rcu_dereference(t->params); + + opt.t_action = params->tcft_action; + opt.action = params->action; + + if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { + struct ip_tunnel_key *key = + ¶ms->tcft_enc_metadata->u.tun_info.key; + __be32 key_id = tunnel_id_to_key32(key->tun_id); + + if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + tunnel_key_dump_addresses(skb, + ¶ms->tcft_enc_metadata->u.tun_info)) + goto nla_put_failure; + } + + tcf_tm_dump(&tm, &t->tcf_tm); + if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), + &tm, TCA_TUNNEL_KEY_PAD)) + goto nla_put_failure; + + ret = skb->len; + goto out; + +nla_put_failure: + nlmsg_trim(skb, b); +out: + rcu_read_unlock(); + + return ret; +} + +static int tunnel_key_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_tunnel_key_ops = { + .kind = "tunnel_key", + .type = TCA_ACT_TUNNEL_KEY, + .owner = THIS_MODULE, 
+ .act = tunnel_key_act, + .dump = tunnel_key_dump, + .init = tunnel_key_init, + .cleanup = tunnel_key_release, + .walk = tunnel_key_walker, + .lookup = tunnel_key_search, + .size = sizeof(struct tcf_tunnel_key), +}; + +static __net_init int tunnel_key_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK); +} + +static void __net_exit tunnel_key_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations tunnel_key_net_ops = { + .init = tunnel_key_init_net, + .exit = tunnel_key_exit_net, + .id = &tunnel_key_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init tunnel_key_init_module(void) +{ + return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +static void __exit tunnel_key_cleanup_module(void) +{ + tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +module_init(tunnel_key_init_module); +module_exit(tunnel_key_cleanup_module); + +MODULE_AUTHOR("Amir Vadai "); +MODULE_DESCRIPTION("ip tunnel manipulation actions"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From a73ec314a0d28cdbc29b4e4ad10871df0829986d Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:13 -0400 Subject: appletalk: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index f066781be3c8..10d2bdce686e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1278,7 +1278,7 @@ out: return err; } -#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE) +#if IS_ENABLED(CONFIG_IPDDP) static __inline__ int is_ip_over_ddp(struct sk_buff *skb) { return skb->data[12] == 22; -- cgit v1.2.3 From 9a81c34ace3598188f633d4654a2a57b7f7a2c2a Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:14 -0400 Subject: lec: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. 
Miller --- net/atm/lec.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index e574a7e9db6f..5d2693826afb 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -31,7 +31,7 @@ #include /* Proxy LEC knows about bridging */ -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) #include "../bridge/br_private.h" static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 }; @@ -121,7 +121,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) { char *buff; @@ -155,7 +155,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) sk->sk_data_ready(sk); } } -#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ +#endif /* IS_ENABLED(CONFIG_BRIDGE) */ /* * Open/initialize the netdevice. This is called (in the current kernel) @@ -222,7 +222,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb, pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n", (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), (long)skb_end_pointer(skb)); -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) lec_handle_bridge(skb, dev); #endif @@ -426,7 +426,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) (unsigned short)(0xffff & mesg->content.normal.flag); break; case l_should_bridge: -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) { pr_debug("%s: bridge zeppelin asks about %pM\n", dev->name, mesg->content.proxy.mac_addr); @@ -452,7 +452,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) sk->sk_data_ready(sk); } } -#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ +#endif /* IS_ENABLED(CONFIG_BRIDGE) */ break; default: pr_info("%s: Unknown message type %d\n", dev->name, mesg->type); -- cgit v1.2.3 From 181402a5c7899fad945485130ded47ca2bf1161e Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:15 -0400 Subject: net: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. 
Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 34b5322bc081..b0d307b6af19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3904,8 +3904,7 @@ static void net_tx_action(struct softirq_action *h) } } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; -- cgit v1.2.3 From 6ca40d4e8463c53e6b778010b9331268865725a6 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:16 -0400 Subject: ipv4: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 65569274efb8..b913f5bf0757 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -490,7 +490,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) +#if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif skb_copy_secmark(to, from); -- cgit v1.2.3 From 9dd79945b0f846ca5282c7df7ecf3823f0243898 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:17 -0400 Subject: l2tp: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.h | 2 +- net/l2tp/l2tp_eth.c | 4 ++-- net/l2tp/l2tp_ppp.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 5871537af387..2599af6378e4 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -139,7 +139,7 @@ struct l2tp_session { void (*session_close)(struct l2tp_session *session); void (*ref)(struct l2tp_session *session); void (*deref)(struct l2tp_session *session); -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) void (*show)(struct seq_file *m, void *priv); #endif uint8_t priv[0]; /* private data */ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 57fc5a46ce06..ef2cd30ca06e 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -195,7 +195,7 @@ static void l2tp_eth_delete(struct l2tp_session *session) } } -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -268,7 +268,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = l2tp_eth_show; #endif diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 34eff77982cf..41d47bfda15c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -552,7 +552,7 @@ out: return error; } -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -723,7 +723,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->recv_skb = pppol2tp_recv; session->session_close = pppol2tp_session_close; -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = pppol2tp_show; #endif -- cgit v1.2.3 From 0013de38a829db3f83a36e3e178ff386eb589c51 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:18 -0400 Subject: net: sched: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. 
Miller --- net/sched/cls_flow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 2c1ae549edbf..a379bae1d74e 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -29,7 +29,7 @@ #include #include -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -125,14 +125,14 @@ static u32 flow_get_mark(const struct sk_buff *skb) static u32 flow_get_nfct(const struct sk_buff *skb) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) return addr_fold(skb->nfct); #else return 0; #endif } -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #define CTTUPLE(skb, member) \ ({ \ enum ip_conntrack_info ctinfo; \ -- cgit v1.2.3 From aebf5de07aabd44db740c9d33b6daa1abd19fa56 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:19 -0400 Subject: sctp: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 912eb1685a5d..f99d4855d3de 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -48,7 +48,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) +#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, .hmac_name = "hmac(sha256)", -- cgit v1.2.3 From 65b323e2ffbb05db4136ee822e08a9b0ec6ac716 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 9 Sep 2016 08:43:20 -0400 Subject: xfrm: use IS_ENABLED() instead of checking for built-in or module The IS_ENABLED() macro checks if a Kconfig symbol has been enabled either built-in or as a module, use that macro instead of open coding the same. Using the macro makes the code more readable by helping abstract away some of the Kconfig built-in and module enable details. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. Miller --- net/xfrm/xfrm_algo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 250e567ba3d6..44ac85fe2bc9 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -17,7 +17,7 @@ #include #include #include -#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP) #include #endif -- cgit v1.2.3 From ed227099dac95128e2aecd62af51bb9d922e5977 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 9 Sep 2016 17:42:30 -0300 Subject: openvswitch: use alias for genetlink family names When userspace tries to create datapaths and the module is not loaded, it will simply fail. With this patch, the module will be automatically loaded. 
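As a rough sketch of why these aliases are enough for autoloading (illustrative only; the exact strings assume the stock MODULE_ALIAS_GENL_FAMILY() definition and OVS_DATAPATH_FAMILY being "ovs_datapath"): each alias is expected to expand to a modalias such as

	MODULE_ALIAS("net-pf-16-proto-16-family-ovs_datapath");	/* PF_NETLINK == 16, NETLINK_GENERIC == 16 */

and when userspace asks the generic netlink controller to resolve a family name that is not registered yet, the controller requests a module with a matching alias, roughly

	request_module("net-pf-%d-proto-%d-family-%s",
		       PF_NETLINK, NETLINK_GENERIC, "ovs_datapath");

which lets modprobe map the family name back to openvswitch.ko.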
Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 524c0fd3078e..0536ab3504d5 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2437,3 +2437,7 @@ module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); -- cgit v1.2.3 From 2594a2a928a010bf27e6545f90bc2de7ed5ed075 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Sep 2016 14:22:45 -0700 Subject: tcp: better use ooo_last_skb in tcp_data_queue_ofo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Willem noticed that we could avoid an rbtree lookup if the attempt to coalesce the incoming skb to the last skb failed for some reason. Since most ooo additions are at the tail, this is definitely worth adding a test and fast path. Suggested-by: Willem de Bruijn Signed-off-by: Eric Dumazet Cc: Yaogong Wang Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a5934c4c8cd4..70b892db9901 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4461,6 +4461,12 @@ coalesce_done: skb = NULL; goto add_sack; } + /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ + if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { + parent = &tp->ooo_last_skb->rbnode; + p = &parent->rb_right; + goto insert; + } /* Find place to insert this segment. Handle overlaps on the way. */ parent = NULL; @@ -4503,7 +4509,7 @@ coalesce_done: } p = &parent->rb_right; } - +insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); -- cgit v1.2.3 From 9ee0034b8f49aaaa7e7c2da8db1038915db99c19 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:52 -0700 Subject: net: flow: Add l3mdev flow update Add l3mdev hook to set FLOWI_FLAG_SKIP_NH_OIF flag and update oif/iif in flow struct if its oif or iif points to a device enslaved to an L3 Master device. Only 1 needs to be converted to match the l3mdev FIB rule. This moves the flow adjustment for l3mdev to a single point catching all lookups. It is redundant for existing hooks (those are removed in later patches) but is needed for missed lookups such as PMTU updates. Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- include/net/l3mdev.h | 6 ++++++ net/ipv4/fib_rules.c | 3 +++ net/ipv6/fib6_rules.c | 3 +++ net/l3mdev/l3mdev.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index e90095091aa0..81e175e80537 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -49,6 +49,8 @@ struct l3mdev_ops { int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +void l3mdev_update_flow(struct net *net, struct flowi *fl); + int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { @@ -290,6 +292,10 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } +static inline +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ +} #endif #endif /* _NET_L3MDEV_H_ */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 6e9ea69e5f75..770bebed6b28 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, }; int err; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi4_to_flowi(flp)); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 5857c1fc8b67..eea23b57c6a5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index c4a1c3e84e12..43610e5acc4e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -222,3 +222,38 @@ out: return rc; } + +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ + struct net_device *dev; + int ifindex; + + rcu_read_lock(); + + if (fl->flowi_oif) { + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_oif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + goto out; + } + } + } + + if (fl->flowi_iif) { + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_iif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + } + } + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(l3mdev_update_flow); -- cgit v1.2.3 From a8e3e1a9f02094145580ea7920c6a1d9aabd5539 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:53 -0700 Subject: net: l3mdev: Add hook to output path This patch adds the infrastructure to the output path to pass an skb to an l3mdev device if it has a hook registered. This is the Tx parallel to l3mdev_ip{6}_rcv in the receive path and is the basis for removing the existing hook that returns the vrf dst on the fib lookup. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/l3mdev.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 8 ++++++++ net/ipv6/ip6_output.c | 8 ++++++++ net/ipv6/output_core.c | 7 +++++++ net/ipv6/raw.c | 7 +++++++ 5 files changed, 78 insertions(+) (limited to 'net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 81e175e80537..53d5274920e3 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -11,6 +11,7 @@ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ +#include #include /** @@ -18,6 +19,10 @@ * * @l3mdev_fib_table: Get FIB table id to use for lookups * + * @l3mdev_l3_rcv: Hook in L3 receive path + * + * @l3mdev_l3_out: Hook in L3 output path + * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * * @l3mdev_get_saddr: Get source address for a flow @@ -29,6 +34,9 @@ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); + struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, + struct sock *sk, struct sk_buff *skb, + u16 proto); /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, @@ -201,6 +209,34 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return l3mdev_l3_rcv(skb, AF_INET6); } +static inline +struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) +{ + struct net_device *dev = skb_dst(skb)->dev; + + if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master && master->l3mdev_ops->l3mdev_l3_out) + skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, + skb, proto); + } + + return skb; +} + +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET); +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET6); +} #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) @@ -286,6 +322,18 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return skb; } +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return skb; +} + static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b913f5bf0757..41e10e34769c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 993fd9666f1b..6ea6caace3a8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -236,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = 
l3mdev_ip6_out((struct sock *)sk, skb); + if (unlikely(!skb)) + return 0; + /* hooks should never assume socket lock is held. * we promote our socket to non const */ diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 462f2a76b5c2..7cca8ac66fe9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 590dd1f7746f..54404f08efcc 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); -- cgit v1.2.3 From 5f02ce24c2696fec33f2a5dfcf753996f5fdd211 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:54 -0700 Subject: net: l3mdev: Allow the l3mdev to be a loopback Allow an L3 master device to act as the loopback for that L3 domain. For IPv4 the device can also have the address 127.0.0.1. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 6 +++--- net/ipv4/route.c | 8 ++++++-- net/ipv6/route.c | 12 ++++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 53d5274920e3..3ee110518584 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -90,7 +90,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. 
@@ -99,7 +99,7 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; - const struct net_device *master; + struct net_device *master; if (!dev) return NULL; @@ -254,7 +254,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3e992783c1d0..f49b2c534e92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2018,7 +2018,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + if (ipv4_is_loopback(fl4->saddr) && + !(dev_out->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) @@ -2302,7 +2304,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, else fl4->saddr = fl4->daddr; } - dev_out = net->loopback_dev; + + /* L3 master device is the loopback for that domain */ + dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 09d43ff11a8d..2c681113c055 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2558,8 +2558,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT); + struct net_device *dev = net->loopback_dev; + struct rt6_info *rt; + + /* use L3 Master device as loopback for host routes if device + * is enslaved and address is not link local or multicast + */ + if (!rt6_need_strict(addr)) + dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; + + rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From ebfc102c566d0d9c174ff9b721fd35ebda01f7eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:55 -0700 Subject: net: vrf: Flip IPv4 output path from FIB lookup hook to out hook Flip the IPv4 output path to use the l3mdev tx out hook. The VRF dst is not returned on the first FIB lookup. Instead, the dst on the skb is switched at the beginning of the IPv4 output processing to send the packet to the VRF driver on xmit. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/route.c | 4 ---- 2 files changed, 63 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 1ce7420322ee..08540b96ec18 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -227,6 +227,20 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, } #endif +/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ +static int vrf_ip_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, + skb, NULL, skb_dst(skb)->dev, dst_output); + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { @@ -292,7 +306,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, RT_SCOPE_LINK); } - ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else @@ -531,6 +545,53 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rtable *rth; + + rcu_read_lock(); + + rth = rcu_dereference(vrf->rth); + if (likely(rth)) { + dst = &rth->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + +/* called with rcu lock held */ +static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb, + u16 proto) +{ + switch (proto) { + case AF_INET: + return vrf_ip_out(vrf_dev, sk, skb); + } + + return skb; +} + /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { @@ -1067,6 +1128,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_get_rtable = vrf_get_rtable, .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, + .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_get_rt6_dst = vrf_get_rt6_dst, .l3mdev_get_saddr6 = vrf_get_saddr6, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f49b2c534e92..ad83f85fb240 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2246,10 +2246,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - - rth = l3mdev_get_rtable(dev_out, fl4); - if (rth) - goto out; } if (!fl4->daddr) { -- cgit v1.2.3 From 4c1feac58e06270321cc500b85c2d94a11495775 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:56 -0700 Subject: net: vrf: Flip IPv6 output path from FIB lookup hook to out hook Flip the IPv6 output path to use the l3mdev tx out hook. The VRF dst is not returned on the first FIB lookup. Instead, the dst on the skb is switched at the beginning of the IPv6 output processing to send the packet to the VRF driver on xmit. 
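In rough terms (a simplified sketch of the resulting path, not an exact call trace), locally generated IPv6 traffic from a socket bound into the VRF now flows as

	__ip6_local_out()
	  -> l3mdev_ip6_out()		/* out hook added earlier in this series */
	       -> vrf_ip6_out()		/* skb dst switched to the VRF's cached rt6 */
	  -> NF_INET_LOCAL_OUT -> dst_output()
	       -> VRF device xmit, which performs the lookup in the VRF table
	          and sends the packet out the real (enslaved) device

so qdiscs, netfilter hooks and packet sockets attached to the VRF device see the packet before it reaches the enslaved interface.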
Link scope addresses (linklocal and multicast) need special handling: specifically the oif the flow struct can not be changed because we want the lookup tied to the enslaved interface. ie., the source address and the returned route MUST point to the interface scope passed in. Convert the existing vrf_get_rt6_dst to handle only link scope addresses. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 124 ++++++++++++++++++++++++++++++++++----------------- include/net/l3mdev.h | 8 ++-- net/ipv6/route.c | 11 +++-- net/l3mdev/l3mdev.c | 15 +++---- 4 files changed, 100 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 08540b96ec18..f5372edf6edc 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -137,6 +137,20 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) +static int vrf_ip6_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, + sk, skb, NULL, skb_dst(skb)->dev, dst_output); + + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { @@ -207,7 +221,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); - ret = ip6_local_out(net, skb->sk, skb); + ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else @@ -391,6 +405,43 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rt6_info *rt6; + + /* don't divert link scope packets */ + if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) + return skb; + + rcu_read_lock(); + + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) { + dst = &rt6->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { @@ -477,6 +528,13 @@ out: return rc; } #else +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + return skb; +} + static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } @@ -587,6 +645,8 @@ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); + case AF_INET6: + return vrf_ip6_out(vrf_dev, sk, skb); } return skb; @@ -1031,53 +1091,33 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - struct flowi6 *fl6) +/* send to link-local or multicast address via interface enslaved to + * VRF device. 
Force lookup to VRF table without changing flow struct + */ +static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, + struct flowi6 *fl6) { - bool need_strict = rt6_need_strict(&fl6->daddr); - struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); + int flags = RT6_LOOKUP_F_IFACE; struct dst_entry *dst = NULL; struct rt6_info *rt; - /* send to link-local or multicast address */ - if (need_strict) { - int flags = RT6_LOOKUP_F_IFACE; - - /* VRF device does not have a link-local address and - * sending packets to link-local or mcast addresses over - * a VRF device does not make sense - */ - if (fl6->flowi6_oif == dev->ifindex) { - struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; - - dst_hold(dst); - return dst; - } - - if (!ipv6_addr_any(&fl6->saddr)) - flags |= RT6_LOOKUP_F_HAS_SADDR; - - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); - if (rt) - dst = &rt->dst; - - } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - - rcu_read_lock(); - - rt = rcu_dereference(vrf->rt6); - if (likely(rt)) { - dst = &rt->dst; - dst_hold(dst); - } - - rcu_read_unlock(); + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + return dst; } - /* make sure oif is set to VRF device for lookup */ - if (!need_strict) - fl6->flowi6_oif = dev->ifindex; + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) + dst = &rt->dst; return dst; } @@ -1130,7 +1170,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) - .l3mdev_get_rt6_dst = vrf_get_rt6_dst, + .l3mdev_link_scope_lookup = vrf_link_scope_lookup, .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 3ee110518584..51aab20a4d0a 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -27,7 +27,7 @@ * * @l3mdev_get_saddr: Get source address for a flow * - * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device + * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { @@ -45,7 +45,7 @@ struct l3mdev_ops { struct flowi4 *fl4); /* IPv6 ops */ - struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, + struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); int (*l3mdev_get_saddr6)(struct net_device *dev, const struct sock *sk, @@ -177,7 +177,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -299,7 +299,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex, } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2c681113c055..87e0a01ce744 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1188,12 +1188,15 
@@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; bool any_src; - dst = l3mdev_get_rt6_dst(net, fl6); - if (dst) - return dst; + if (rt6_need_strict(&fl6->daddr)) { + struct dst_entry *dst; + + dst = l3mdev_link_scope_lookup(net, fl6); + if (dst) + return dst; + } fl6->flowi6_iif = LOOPBACK_IFINDEX; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 43610e5acc4e..ac9d928d0a9e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** - * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns - * cached route for L3 master device if relevant - * to flow + * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link + * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -121,15 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_rt6_dst) - dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + dev->l3mdev_ops->l3mdev_link_scope_lookup) + dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } -EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); +EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_get_saddr - get source address for a flow based on an interface -- cgit v1.2.3 From e0d56fdd734224666e7bd5fafbc620286d2a7ee8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:57 -0700 Subject: net: l3mdev: remove redundant calls A previous patch added l3mdev flow update making these hooks redundant. Remove them. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 +-- net/ipv4/route.c | 12 ++---------- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/ip6_output.c | 2 -- net/ipv6/ndisc.c | 11 ++--------- net/ipv6/route.c | 7 +------ net/ipv6/tcp_ipv6.c | 8 ++------ net/ipv6/xfrm6_policy.c | 2 +- 8 files changed, 10 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 41e10e34769c..05d105832bdb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1582,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) - oif = skb->skb_iif; + oif = oif ? : skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ad83f85fb240..b52496fd5107 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1831,7 +1831,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. 
*/ fl4.flowi4_oif = 0; - fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); + fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -2150,7 +2150,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int master_idx; int orig_oif; int err = -ENETUNREACH; @@ -2160,9 +2159,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, orig_oif = fl4->flowi4_oif; - master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif); - if (master_idx) - fl4->flowi4_oif = master_idx; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? @@ -2263,8 +2259,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif && - !netif_index_is_l3_master(net, fl4->flowi4_oif)) { + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2577,9 +2572,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; - if (netif_index_is_l3_master(net, fl4.flowi4_oif)) - fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; - if (iif) { struct net_device *dev; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b644a23c3db0..3155ed73d3b3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) int oif = 0; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6ea6caace3a8..1cb41b365048 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1070,8 +1070,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - if (!fl6->flowi6_oif) - fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index fe65cdc28a45..d8e671457d10 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include @@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb, if (!dst) { struct flowi6 fl6; - int oif = l3mdev_fib_oif(skb->dev); + int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); @@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; - int oif = l3mdev_fib_oif(dev); bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { @@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, - &saddr_buf, &ipv6_hdr(skb)->saddr, oif); - - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; + &saddr_buf, &ipv6_hdr(skb)->saddr, 
dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 87e0a01ce744..ad4a7ff301fc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1164,7 +1164,7 @@ void ip6_route_input(struct sk_buff *skb) int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = l3mdev_fib_oif(skb->dev), + .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -3349,11 +3349,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) } else { fl6.flowi6_oif = oif; - if (netif_index_is_l3_master(net, oif)) { - fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF; - } - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 04529a3d42cb..54cf7197c7ab 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -818,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else { - if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) - oif = skb->skb_iif; - - fl6.flowi6_oif = oif; - } + else + fl6.flowi6_oif = oif ? : skb->skb_iif; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6cc97003e4a9..b7b7e863a2bb 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; -- cgit v1.2.3 From d66f6c0a8f3c0bcc4ee7a9b1da4b0ebe7ee555a3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:58 -0700 Subject: net: ipv4: Remove l3mdev_get_saddr No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 38 -------------------------------------- include/net/l3mdev.h | 12 ------------ include/net/route.h | 10 ---------- net/ipv4/raw.c | 6 ------ net/ipv4/udp.c | 6 ------ net/l3mdev/l3mdev.c | 31 ------------------------------- 6 files changed, 103 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f5372edf6edc..9ad2a169485f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -863,43 +863,6 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev, return rth; } -/* called under rcu_read_lock */ -static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) -{ - struct fib_result res = { .tclassid = 0 }; - struct net *net = dev_net(dev); - u32 orig_tos = fl4->flowi4_tos; - u8 flags = fl4->flowi4_flags; - u8 scope = fl4->flowi4_scope; - u8 tos = RT_FL_TOS(fl4); - int rc; - - if (unlikely(!fl4->daddr)) - return 0; - - fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; - fl4->flowi4_iif = LOOPBACK_IFINDEX; - /* make sure oif is set to VRF device for lookup */ - fl4->flowi4_oif = dev->ifindex; - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); - - rc = fib_lookup(net, fl4, &res, 0); - if (!rc) { - if (res.type == RTN_LOCAL) - fl4->saddr = res.fi->fib_prefsrc ? 
: fl4->daddr; - else - fib_select_path(net, &res, fl4, -1); - } - - fl4->flowi4_flags = flags; - fl4->flowi4_tos = orig_tos; - fl4->flowi4_scope = scope; - - return rc; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -1166,7 +1129,6 @@ static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_get_rtable = vrf_get_rtable, - .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 51aab20a4d0a..1129e1d8cd6e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -25,8 +25,6 @@ * * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device * - * @l3mdev_get_saddr: Get source address for a flow - * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ @@ -41,8 +39,6 @@ struct l3mdev_ops { /* IPv4 ops */ struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, const struct flowi4 *fl4); - int (*l3mdev_get_saddr)(struct net_device *dev, - struct flowi4 *fl4); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, @@ -175,8 +171,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return rc; } -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); - struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6); @@ -292,12 +286,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return false; } -static inline int l3mdev_get_saddr(struct net *net, int ifindex, - struct flowi4 *fl4) -{ - return 0; -} - static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { diff --git a/include/net/route.h b/include/net/route.h index ad777d79af94..0429d47cad25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -285,15 +284,6 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_route_connect_init(fl4, dst, src, tos, oif, protocol, sport, dport, sk); - if (!src && oif) { - int rc; - - rc = l3mdev_get_saddr(net, oif, fl4); - if (rc < 0) - return ERR_PTR(rc); - - src = fl4->saddr; - } if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 438f50c1a676..90a85c955872 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, &fl4); - if (err < 0) - goto done; - } - if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 058c31286ce1..7d96dc2d3d08 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flow_flags, faddr, saddr, dport, inet->inet_sport); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, fl4); - if (err < 0) - goto out; - } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ac9d928d0a9e..be40df60703c 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,37 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -/** - * l3mdev_get_saddr - get source address for a flow based on an interface - * enslaved to an L3 master device - * @net: network namespace for device index lookup - * @ifindex: Interface index - * @fl4: IPv4 flow struct - */ - -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) -{ - struct net_device *dev; - int rc = 0; - - if (ifindex) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr) - rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr); - int l3mdev_get_saddr6(struct net *net, const struct sock *sk, struct flowi6 *fl6) { -- cgit v1.2.3 From 8a966fc016b67d2a8ab4a83d22ded8cde032a0eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 10 Sep 2016 12:09:59 -0700 Subject: net: ipv6: Remove l3mdev_get_saddr6 No longer needed Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 41 ----------------------------------------- include/net/l3mdev.h | 11 ----------- net/ipv6/ip6_output.c | 9 +-------- net/l3mdev/l3mdev.c | 24 ------------------------ 4 files changed, 1 insertion(+), 84 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9ad2a169485f..3a34f547c578 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1084,46 +1084,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, return dst; } - -/* called under rcu_read_lock */ -static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net *net = dev_net(dev); - struct dst_entry *dst; - struct rt6_info *rt; - int err; - - if (rt6_need_strict(&fl6->daddr)) { - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, - RT6_LOOKUP_F_IFACE); - if (unlikely(!rt)) - return 0; - - dst = &rt->dst; - } else { - __u8 flags = fl6->flowi6_flags; - - fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; - fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; - - dst = ip6_route_output(net, sk, fl6); - rt = (struct rt6_info *)dst; - - fl6->flowi6_flags = flags; - } - - err = dst->error; - if (!err) { - err = ip6_route_get_saddr(net, rt, &fl6->daddr, - sk ? 
inet6_sk(sk)->srcprefs : 0, - &fl6->saddr); - } - - dst_release(dst); - - return err; -} #endif static const struct l3mdev_ops vrf_l3mdev_ops = { @@ -1133,7 +1093,6 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, - .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 1129e1d8cd6e..a5e506eb51de 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -43,9 +43,6 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); - int (*l3mdev_get_saddr6)(struct net_device *dev, - const struct sock *sk, - struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -172,8 +169,6 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -292,12 +287,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) return NULL; } -static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - return 0; -} - static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1cb41b365048..6001e781164e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -926,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; - if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && - (!*dst || !(*dst)->error)) { - err = l3mdev_get_saddr6(net, sk, fl6); - if (err) - goto out_err; - } - /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -1024,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, out_err_release: dst_release(*dst); *dst = NULL; -out_err: + if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index be40df60703c..8da86ceca33d 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -130,30 +130,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net_device *dev; - int rc = 0; - - if (fl6->flowi6_oif) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr6) - rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); - /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device -- cgit v1.2.3 From b7fb44dacae04219c82f20897382ba34860d1a16 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Wed, 3 Aug 2016 17:02:15 -0500 Subject: nl80211: Allow GET_INTERFACE dumps to be filtered This patch allows GET_INTERFACE dumps to be filtered based on NL80211_ATTR_WIPHY or NL80211_ATTR_WDEV. 
The documentation for GET_INTERFACE mentions that this is possible: "Request an interface's configuration; either a dump request on a %NL80211_ATTR_WIPHY or ..." However, this behavior has not been implemented until now. Johannes: rewrite most of the patch: * use nl80211_dump_wiphy_parse() to also allow passing an interface to be able to dump its siblings * fix locking (must hold rtnl around using nl80211_fam.attrbuf) * make init self-contained instead of relying on other cb->args Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 499785778983..7ebad350abc1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2525,10 +2525,35 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * int if_idx = 0; int wp_start = cb->args[0]; int if_start = cb->args[1]; + int filter_wiphy = -1; struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rtnl_lock(); + if (!cb->args[2]) { + struct nl80211_dump_wiphy_state state = { + .filter_wiphy = -1, + }; + int ret; + + ret = nl80211_dump_wiphy_parse(skb, cb, &state); + if (ret) + return ret; + + filter_wiphy = state.filter_wiphy; + + /* + * if filtering, set cb->args[2] to +1 since 0 is the default + * value needed to determine that parsing is necessary. + */ + if (filter_wiphy >= 0) + cb->args[2] = filter_wiphy + 1; + else + cb->args[2] = -1; + } else if (cb->args[2] > 0) { + filter_wiphy = cb->args[2] - 1; + } + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk))) continue; @@ -2536,6 +2561,10 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; continue; } + + if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx) + continue; + if_idx = 0; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { -- cgit v1.2.3 From d82121845d44334f5ec3c98906d1e4a592350beb Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Mon, 29 Aug 2016 23:25:15 +0300 Subject: mac80211: refactor monitor representation in sdata Insert the u32 monitor flags variable in a new structure that represents a monitor interface. This will allow to add more configuration variables to that structure which will happen in an upcoming change. 
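The shape of the change is easy to see in a reduced sketch (illustrative definitions, not the actual mac80211 structures): the bare flags word in the per-interface-type union becomes a struct, so later patches can add monitor-specific members without touching every arm of the union.

struct example_if_mntr {
	u32 flags;			/* was the bare u.mntr_flags word */
	/* room for future monitor-only state */
};

struct example_sub_if_data {
	union {
		/* ... other interface types ... */
		struct example_if_mntr mntr;	/* accessed as u.mntr.flags */
	} u;
};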
Signed-off-by: Aviya Erenfeld Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 8 ++++---- net/mac80211/driver-ops.c | 2 +- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/iface.c | 16 ++++++++-------- net/mac80211/rx.c | 4 ++-- net/mac80211/status.c | 2 +- net/mac80211/tx.c | 2 +- net/mac80211/util.c | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 543b1d4fc33d..f2c8cd22d317 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -39,7 +39,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, if (type == NL80211_IFTYPE_MONITOR && flags) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; } return wdev; @@ -89,11 +89,11 @@ static int ieee80211_change_iface(struct wiphy *wiphy, * cooked_mntrs, monitor and all fif_* counters * reconfigure hardware */ - if ((*flags & mask) != (sdata->u.mntr_flags & mask)) + if ((*flags & mask) != (sdata->u.mntr.flags & mask)) return -EBUSY; ieee80211_adjust_monitor_flags(sdata, -1); - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); @@ -103,7 +103,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, * and ieee80211_do_open take care of "everything" * mentioned in the comment above. */ - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; } } diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index c258f1041d33..c701b6438bd9 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -62,7 +62,7 @@ int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) + !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f56d342c31b8..9211cce10d3e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -824,6 +824,10 @@ struct txq_info { struct ieee80211_txq txq; }; +struct ieee80211_if_mntr { + u32 flags; +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -922,7 +926,7 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ibss ibss; struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; - u32 mntr_flags; + struct ieee80211_if_mntr mntr; } u; #ifdef CONFIG_MAC80211_DEBUGFS diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b123a9e325b3..c8509d95e09d 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -188,7 +188,7 @@ static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, continue; if (iter->vif.type == NL80211_IFTYPE_MONITOR && - !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; m = iter->vif.addr; @@ -217,7 +217,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) return -EBUSY; if (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) check_dup = false; ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup); @@ -357,7 +357,7 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset) { struct ieee80211_local *local = sdata->local; - u32 flags = 
sdata->u.mntr_flags; + u32 flags = sdata->u.mntr.flags; #define ADJUST(_f, _s) do { \ if (flags & MONITOR_FLAG_##_f) \ @@ -589,12 +589,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) } break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs++; break; } - if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { + if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { res = drv_add_interface(local, sdata); if (res) goto err_stop; @@ -926,7 +926,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs--; break; } @@ -1012,7 +1012,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); - if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; /* fall through */ @@ -1444,7 +1444,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_MONITOR: sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; sdata->dev->netdev_ops = &ieee80211_monitorif_ops; - sdata->u.mntr_flags = MONITOR_FLAG_CONTROL | + sdata->u.mntr.flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; case NL80211_IFTYPE_WDS: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9dce3b157908..708c3b1e49a1 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -567,7 +567,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (sdata->vif.type != NL80211_IFTYPE_MONITOR) continue; - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) continue; if (!ieee80211_sdata_running(sdata)) @@ -3147,7 +3147,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR || - !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) + !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)) continue; if (prev_dev) { diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a2a68269675d..fabd9ff710d9 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -709,7 +709,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, if (!ieee80211_sdata_running(sdata)) continue; - if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && + if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1d0746dfea57..efc38e7b90b9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1643,7 +1643,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { + if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { vif = &sdata->vif; break; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 42bf0b6685e8..e777c2a6568f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -598,7 +598,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; break; case 
NL80211_IFTYPE_AP_VLAN: -- cgit v1.2.3 From e4819013840bbad025ed6da660c1e8b3e9e8430a Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Wed, 31 Aug 2016 00:35:07 +0530 Subject: cfg80211: Remove deprecated create_singlethread_workqueue The workqueue "cfg80211_wq" is involved in cleanup, scan and event related works. It queues multiple work items &rdev->event_work, &rdev->dfs_update_channels_wk, &wiphy_to_rdev(request->wiphy)->scan_done_wk, &wiphy_to_rdev(wiphy)->sched_scan_results_wk, which require strict execution ordering. Hence, an ordered dedicated workqueue has been used. Since it's a wireless driver, WQ_MEM_RECLAIM has been set to ensure forward progress under memory pressure. Signed-off-by: Bhaktipriya Shridhar Acked-by: Tejun Heo Signed-off-by: Johannes Berg --- net/wireless/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 2029b49a1df3..4911cd997b9a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1252,7 +1252,7 @@ static int __init cfg80211_init(void) if (err) goto out_fail_reg; - cfg80211_wq = create_singlethread_workqueue("cfg80211"); + cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM); if (!cfg80211_wq) { err = -ENOMEM; goto out_fail_wq; -- cgit v1.2.3 From 480dd46b9d6812e5fb7172c305ee0f1154c26eed Mon Sep 17 00:00:00 2001 From: Maxim Altshul Date: Mon, 22 Aug 2016 17:14:04 +0300 Subject: mac80211: RX BA support for sta max_rx_aggregation_subframes The ability to change the max_rx_aggregation frames is useful in cases of IOP. There exist some devices (latest mobile phones and some AP's) that tend to not respect a BA sessions maximum size (in Kbps). These devices won't respect the AMPDU size that was negotiated during association (even though they do respect the maximal number of packets). This violation is characterized by a valid number of packets in a single AMPDU. Even so, the total size will exceed the size negotiated during association. Eventually, this will cause some undefined behavior, which in turn causes the hw to drop packets, causing the throughput to plummet. This patch will make the subframe limitation to be held by each station, instead of being held only by hw. Signed-off-by: Maxim Altshul Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/agg-rx.c | 7 +++++-- net/mac80211/sta_info.c | 3 +++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cca510a585c3..a1457ca2a30c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1735,6 +1735,9 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU + * that this station is allowed to transmit to us. + * Can be modified by driver. 
* @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to @@ -1775,6 +1778,7 @@ struct ieee80211_sta { u16 aid; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u8 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..282e99bdb301 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -298,10 +298,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size = IEEE80211_MAX_AMPDU_BUF; /* make sure the size doesn't exceed the maximum supported by the hw */ - if (buf_size > local->hw.max_rx_aggregation_subframes) - buf_size = local->hw.max_rx_aggregation_subframes; + if (buf_size > sta->sta.max_rx_aggregation_subframes) + buf_size = sta->sta.max_rx_aggregation_subframes; params.buf_size = buf_size; + ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n", + buf_size, sta->sta.addr); + /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 19f14c907d74..5e70fa52e1ff 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -340,6 +340,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); + sta->sta.max_rx_aggregation_subframes = + local->hw.max_rx_aggregation_subframes; + sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; -- cgit v1.2.3 From 42bd20d99857e69e368d5421ea402127d5835cd3 Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Mon, 29 Aug 2016 23:25:16 +0300 Subject: mac80211: add support for MU-MIMO air sniffer Add support for the MU-MIMO air sniffer according to groupID: in monitor mode, use a given MU-MIMO groupID to monitor stations that belong to that group using MU-MIMO. Add support for following a station according to its MAC address using the VHT MU-MIMO sniffer: the monitor waits until it gets a MU-MIMO action notification frame, then parses it to find the groupID that corresponds to the given MAC address and monitors packets destined to that groupID using VHT MU-MIMO.
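For reference, a self-contained sketch of how the two arrays carried in a VHT Group ID Management frame are commonly interpreted: 8 octets of membership status (one bit per group 0..63) followed by 16 octets of user positions (two bits per group). The constants and helpers below are illustrative, not mac80211 code.

#include <stdbool.h>
#include <stdint.h>

#define EX_MEMBERSHIP_LEN	8	/* 64 groups, one bit each */
#define EX_USER_POSITION_LEN	16	/* 64 groups, two bits each */

static bool ex_group_is_member(const uint8_t membership[EX_MEMBERSHIP_LEN],
			       unsigned int group_id)	/* group_id < 64 */
{
	return membership[group_id / 8] & (1u << (group_id % 8));
}

static unsigned int ex_user_position(const uint8_t position[EX_USER_POSITION_LEN],
				     unsigned int group_id)
{
	/* two bits per group: byte = group/4, shift = 2 * (group % 4) */
	return (position[group_id / 4] >> (2 * (group_id % 4))) & 0x3;
}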
Signed-off-by: Aviya Erenfeld Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 23 ++++++++++++++++++++++- net/mac80211/driver-ops.h | 3 ++- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/iface.c | 5 +++++ net/mac80211/rx.c | 20 ++++++++++++++++++++ 5 files changed, 51 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f2c8cd22d317..5d4afead804e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -73,8 +73,29 @@ static int ieee80211_change_iface(struct wiphy *wiphy, sdata->u.mgd.use_4addr = params->use_4addr; } - if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *monitor_sdata; + u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + monitor_sdata = rtnl_dereference(local->monitor_sdata); + if (monitor_sdata && + wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) { + memcpy(monitor_sdata->vif.bss_conf.mu_group.membership, + params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); + memcpy(monitor_sdata->vif.bss_conf.mu_group.position, + params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, + WLAN_USER_POSITION_LEN); + monitor_sdata->vif.mu_mimo_owner = true; + ieee80211_bss_info_change_notify(monitor_sdata, + BSS_CHANGED_MU_GROUPS); + + ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr, + params->macaddr); + } + + if (!flags) + return 0; if (ieee80211_sdata_running(sdata)) { u32 mask = MONITOR_FLAG_COOK_FRAMES | diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 42a41ae405ba..c39f93b48791 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -162,7 +162,8 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || - sdata->vif.type == NL80211_IFTYPE_MONITOR)) + (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !sdata->vif.mu_mimo_owner))) return; if (!check_sdata_in_driver(sdata)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9211cce10d3e..75761686a98b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -3,7 +3,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -826,6 +826,7 @@ struct txq_info { struct ieee80211_if_mntr { u32 flags; + u8 mu_follow_addr[ETH_ALEN] __aligned(2); }; struct ieee80211_sub_if_data { diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c8509d95e09d..b0abddc714ef 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -43,6 +43,8 @@ * by either the RTNL, the iflist_mtx or RCU. 
*/ +static void ieee80211_iface_work(struct work_struct *work); + bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) { struct ieee80211_chanctx_conf *chanctx_conf; @@ -448,6 +450,9 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + skb_queue_head_init(&sdata->skb_queue); + INIT_WORK(&sdata->work, ieee80211_iface_work); + return 0; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 708c3b1e49a1..6a265aa73a46 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -485,6 +485,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct net_device *prev_dev = NULL; int present_fcs_len = 0; unsigned int rtap_vendor_space = 0; + struct ieee80211_mgmt *mgmt; + struct ieee80211_sub_if_data *monitor_sdata = + rcu_dereference(local->monitor_sdata); if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -585,6 +588,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, ieee80211_rx_stats(sdata->dev, skb->len); } + mgmt = (void *)skb->data; + if (monitor_sdata && + skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN && + ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT && + mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT && + is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) && + ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) { + struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC); + + if (mu_skb) { + mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&monitor_sdata->skb_queue, mu_skb); + ieee80211_queue_work(&local->hw, &monitor_sdata->work); + } + } + if (prev_dev) { skb->dev = prev_dev; netif_receive_skb(skb); -- cgit v1.2.3 From 99ee7cae3bf3ce04e90d7b193d9f4f59a7044d91 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2016 23:25:17 +0300 Subject: mac80211: add support for radiotap timestamp field Use the existing device timestamp from the RX status information to add support for the new radiotap timestamp field. Currently only 32-bit counters are supported, but we also add the radiotap mactime where applicable. This new field allows more flexibility in where the timestamp is taken etc. The non-timestamp data in the field is taken from a new field in the hw struct. Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 21 +++++++++++++++++++++ include/net/mac80211.h | 12 ++++++++++++ net/mac80211/main.c | 3 +++ net/mac80211/rx.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+) (limited to 'net') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index b0fd9476c538..ba07b9d8ed63 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -190,6 +190,10 @@ struct ieee80211_radiotap_header { * IEEE80211_RADIOTAP_VHT u16, u8, u8, u8[4], u8, u8, u16 * * Contains VHT information about this frame. + * + * IEEE80211_RADIOTAP_TIMESTAMP u64, u16, u8, u8 variable + * + * Contains timestamp information for this frame. 
*/ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_TSFT = 0, @@ -214,6 +218,7 @@ enum ieee80211_radiotap_type { IEEE80211_RADIOTAP_MCS = 19, IEEE80211_RADIOTAP_AMPDU_STATUS = 20, IEEE80211_RADIOTAP_VHT = 21, + IEEE80211_RADIOTAP_TIMESTAMP = 22, /* valid in every it_present bitmap, even vendor namespaces */ IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29, @@ -321,6 +326,22 @@ enum ieee80211_radiotap_type { #define IEEE80211_RADIOTAP_CODING_LDPC_USER2 0x04 #define IEEE80211_RADIOTAP_CODING_LDPC_USER3 0x08 +/* For IEEE80211_RADIOTAP_TIMESTAMP */ +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK 0x000F +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MS 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_US 0x0001 +#define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS 0x0003 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK 0x00F0 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU 0x0000 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU 0x0010 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU 0x0020 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ 0x0030 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN 0x00F0 + +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT 0x00 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT 0x01 +#define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY 0x02 + /* helpers */ static inline int ieee80211_get_radiotap_len(unsigned char *data) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a1457ca2a30c..08bac23c8de1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2145,6 +2145,14 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_* values. * + * @radiotap_timestamp: Information for the radiotap timestamp field; if the + * 'units_pos' member is set to a non-negative value it must be set to + * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a + * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value, and then the timestamp + * field will be added and populated from the &struct ieee80211_rx_status + * device_timestamp. If the 'accuracy' member is non-negative, it's put + * into the accuracy radiotap field and the accuracy known flag is set. + * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, * other features will be rejected during HW registration. 
@@ -2188,6 +2196,10 @@ struct ieee80211_hw { u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; u16 radiotap_vht_details; + struct { + int units_pos; + s16 accuracy; + } radiotap_timestamp; netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d00ea9b13f49..ac053a9df36d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_roc_setup(local); + local->hw.radiotap_timestamp.units_pos = -1; + local->hw.radiotap_timestamp.accuracy = -1; + return &local->hw; err_free: wiphy_free(wiphy); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6a265aa73a46..284f0f25e22e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + len = ALIGN(len, 8); + len += 12; + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + u16 accuracy = 0; + u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + + /* ensure 8 byte alignment */ + while ((pos - (u8 *)rthdr) & 7) + pos++; + + put_unaligned_le64(status->device_timestamp, pos); + pos += sizeof(u64); + + if (local->hw.radiotap_timestamp.accuracy >= 0) { + accuracy = local->hw.radiotap_timestamp.accuracy; + flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY; + } + put_unaligned_le16(accuracy, pos); + pos += sizeof(u16); + + *pos++ = local->hw.radiotap_timestamp.units_pos; + *pos++ = flags; + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; -- cgit v1.2.3 From bfe40fa395ddc41e45310a4426574703a05e1177 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2016 23:25:18 +0300 Subject: mac80211: send delBA on unexpected BlockAck data frames When we receive data frames with ACK policy BlockAck, send delBA as requested by the 802.11 spec. Since this would be happening for every frame inside an A-MPDU if it's really received outside a session, limit it to a single attempt. 
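The "only once" behaviour boils down to a test-and-set on a per-TID bitmap that is cleared again once a real BA session is established. A stripped-down model of that pattern follows; it is not the mac80211 code itself and ignores the atomicity the real implementation gets from test_and_set_bit().

#include <stdbool.h>
#include <stdint.h>

struct ex_agg_state {
	uint16_t session_valid;		/* bit per TID: BA session exists */
	uint16_t unexpected_agg;	/* bit per TID: delBA already sent */
};

static bool ex_should_send_delba(struct ex_agg_state *st, unsigned int tid)
{
	if (st->session_valid & (1u << tid))
		return false;			/* session exists, nothing to do */
	if (st->unexpected_agg & (1u << tid))
		return false;			/* already complained once */
	st->unexpected_agg |= 1u << tid;	/* remember that we sent it */
	return true;
}

static void ex_ba_session_established(struct ex_agg_state *st, unsigned int tid)
{
	st->session_valid |= 1u << tid;
	st->unexpected_agg &= ~(1u << tid);	/* allow a fresh delBA later */
}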
Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 4 +++- net/mac80211/rx.c | 9 ++++++++- net/mac80211/sta_info.h | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 282e99bdb301..a5d69dfc8e03 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -409,8 +409,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } end: - if (status == WLAN_STATUS_SUCCESS) + if (status == WLAN_STATUS_SUCCESS) { __set_bit(tid, sta->ampdu_mlme.agg_session_valid); + __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); + } mutex_unlock(&sta->ampdu_mlme.mtx); end_no_lock: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 284f0f25e22e..ad636c930f84 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1122,8 +1122,15 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); - if (!tid_agg_rx) + if (!tid_agg_rx) { + if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK && + !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) && + !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) + ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, + WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP); goto dont_reorder; + } /* qos null data frames are excluded */ if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC))) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0556be3e3628..530231b73278 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -230,6 +230,8 @@ struct tid_ampdu_rx { * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs * @agg_session_valid: bitmap indicating which TID has a rx BA session open on + * @unexpected_agg: bitmap indicating which TID already sent a delBA due to + * unexpected aggregation related frames outside a session * @work: work struct for starting/stopping aggregation * @tid_tx: aggregation info for Tx per TID * @tid_start_tx: sessions where start was requested @@ -244,6 +246,7 @@ struct sta_ampdu_mlme { unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; + unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct work_struct work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; -- cgit v1.2.3 From 53f249747d5ec6434415a6895b5690bf4f1d5d7d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2016 23:25:19 +0300 Subject: mac80211: send delBA on unexpected BlockAck Request If we don't have a BA session, send delBA, as requested by the IEEE 802.11 spec. Apply the same limit of sending such a delBA only once as in the previous patch. 
Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ad636c930f84..e796060b7c5e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2592,6 +2592,12 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid = le16_to_cpu(bar_data.control) >> 12; + if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) && + !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) + ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, + WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP); + tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) return RX_DROP_MONITOR; -- cgit v1.2.3 From 83843c80dcf11a78995d167255b03072a1e49c2c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 28 Aug 2016 13:10:37 +0200 Subject: mac80211: fix tim recalculation after PS response Handle the case where the mac80211 intermediate queues are empty and the driver has buffered frames Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 76b737dcc36f..aa58df80ede0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { - unsigned long tids = sta->txq_buffered_tids & driver_release_tids; int tid; /* @@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) + if (!(driver_release_tids & BIT(tid)) || + txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); -- cgit v1.2.3 From df6ef5d8a87ace995d5c10a7bd684be05911a321 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 4 Sep 2016 18:00:59 +0200 Subject: mac80211: fix sequence number assignment for PS response frames When using intermediate queues, sequence number allocation is deferred until dequeue. This doesn't work for PS response frames, which bypass those queues. 
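Conceptually the fix makes the sequence-number decision depend on the same question the queueing code asks, namely "will this frame go through an intermediate TXQ?", so a frame that bypasses the queues (such as a powersave response) is numbered immediately. A hedged sketch of that decision, with placeholder names rather than the exact mac80211 helpers:

/* illustrative only */
static void example_assign_seq(struct example_tx *tx)
{
	if (example_get_txq(tx->local, tx->vif, tx->sta, tx->skb))
		return;		/* queued frames get their seqno at dequeue */

	/* bypass path (e.g. PS response): number the frame now */
	tx->hdr->seq_ctrl = example_next_seq(tx->sta, tx->tid);
}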
Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 65 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 502396694f47..cc8e95554b48 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -796,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -853,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) + if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, + tx->skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; @@ -1243,36 +1274,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_txq *txq = NULL; - - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - return NULL; - - if (!ieee80211_is_data(hdr->frame_control)) - return NULL; - - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; - } - - if (!txq) - return NULL; - - return to_txq_info(txq); -} - static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); @@ -3264,7 +3265,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) + if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; -- cgit v1.2.3 From 5df20f2141eadb5430caaad20eceac61cfe0f139 Mon Sep 17 00:00:00 2001 From: "Pedersen, Thomas" Date: Tue, 6 Sep 2016 11:59:00 -0700 Subject: mac80211: make mpath path fixing more robust A fixed mpath was not quite being treated as such: 1) if a PERR frame was received, a fixed mpath was deactivated. 2) queued path discovery for fixed mpath was potentially being considered, changing mpath state. 3) other mpath flags were potentially being inherited when fixing the mpath. Just assign PATH_FIXED and SN_VALID. This solves several issues when fixing a mesh path in one direction. 
The reverse direction mpath should probably also be fixed, or root announcements at least be enabled. Signed-off-by: Thomas Pedersen Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 3 ++- net/mac80211/mesh_pathtbl.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde835f..faccef977670 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, sta = next_hop_deref_protected(mpath); if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && + !(mpath->flags & MESH_PATH_FIXED) && (!(mpath->flags & MESH_PATH_SN_VALID) || SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; @@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { + if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) { spin_unlock_bh(&mpath->state_lock); goto enddiscovery; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 6db2ddfa0695..f0e6175a9821 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->metric = 0; mpath->hop_count = 0; mpath->exp_time = 0; - mpath->flags |= MESH_PATH_FIXED; + mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); mesh_path_tx_pending(mpath); -- cgit v1.2.3 From ecfcdfec7e0cc64215a194044305f02a5a836e6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 15:38:12 +0200 Subject: netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup() nf_nat_setup_info() returns NF_* verdicts, so convert them to error codes that is what ctnelink expects. This has passed overlook without having any impact since this nf_nat_setup_info() has always returned NF_ACCEPT so far. Since 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable"), this is problem. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..19c081e1b328 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -807,7 +807,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, if (err < 0) return err; - return nf_nat_setup_info(ct, &range, manip); + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int -- cgit v1.2.3 From bf2c4b6f9b74c2ee1dd3c050b181e9b9c86fbcdb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2016 10:50:38 -0400 Subject: svcauth_gss: Revert 64c59a3726f2 ("Remove unnecessary allocation") rsc_lookup steals the passed-in memory to avoid doing an allocation of its own, so we can't just pass in a pointer to memory that someone else is using. If we really want to avoid allocation there then maybe we should preallocate somwhere, or reference count these handles. For now we should revert. On occasion I see this on my server: kernel: kernel BUG at /home/cel/src/linux/linux-2.6/mm/slub.c:3851! 
kernel: invalid opcode: 0000 [#1] SMP kernel: Modules linked in: cts rpcsec_gss_krb5 sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd btrfs xor iTCO_wdt iTCO_vendor_support raid6_pq pcspkr i2c_i801 i2c_smbus lpc_ich mfd_core mei_me sg mei shpchp wmi ioatdma ipmi_si ipmi_msghandler acpi_pad acpi_power_meter rpcrdma ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm nfsd nfs_acl lockd grace auth_rpcgss sunrpc ip_tables xfs libcrc32c mlx4_ib mlx4_en ib_core sr_mod cdrom sd_mod ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel igb mlx4_core ahci libahci libata ptp pps_core dca i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod kernel: CPU: 7 PID: 145 Comm: kworker/7:2 Not tainted 4.8.0-rc4-00006-g9d06b0b #15 kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 kernel: Workqueue: events do_cache_clean [sunrpc] kernel: task: ffff8808541d8000 task.stack: ffff880854344000 kernel: RIP: 0010:[] [] kfree+0x155/0x180 kernel: RSP: 0018:ffff880854347d70 EFLAGS: 00010246 kernel: RAX: ffffea0020fe7660 RBX: ffff88083f9db064 RCX: 146ff0f9d5ec5600 kernel: RDX: 000077ff80000000 RSI: ffff880853f01500 RDI: ffff88083f9db064 kernel: RBP: ffff880854347d88 R08: ffff8808594ee000 R09: ffff88087fdd8780 kernel: R10: 0000000000000000 R11: ffffea0020fe76c0 R12: ffff880853f01500 kernel: R13: ffffffffa013cf76 R14: ffffffffa013cff0 R15: ffffffffa04253a0 kernel: FS: 0000000000000000(0000) GS:ffff88087fdc0000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007fed60b020c3 CR3: 0000000001c06000 CR4: 00000000001406e0 kernel: Stack: kernel: ffff8808589f2f00 ffff880853f01500 0000000000000001 ffff880854347da0 kernel: ffffffffa013cf76 ffff8808589f2f00 ffff880854347db8 ffffffffa013d006 kernel: ffff8808589f2f20 ffff880854347e00 ffffffffa0406f60 0000000057c7044f kernel: Call Trace: kernel: [] rsc_free+0x16/0x90 [auth_rpcgss] kernel: [] rsc_put+0x16/0x30 [auth_rpcgss] kernel: [] cache_clean+0x2e0/0x300 [sunrpc] kernel: [] do_cache_clean+0xe/0x70 [sunrpc] kernel: [] process_one_work+0x1ff/0x3b0 kernel: [] worker_thread+0x2bc/0x4a0 kernel: [] ? rescuer_thread+0x3a0/0x3a0 kernel: [] kthread+0xe4/0xf0 kernel: [] ret_from_fork+0x1f/0x40 kernel: [] ? kthread_stop+0x110/0x110 kernel: Code: f7 ff ff eb 3b 65 8b 05 da 30 e2 7e 89 c0 48 0f a3 05 a0 38 b8 00 0f 92 c0 84 c0 0f 85 d1 fe ff ff 0f 1f 44 00 00 e9 f5 fe ff ff <0f> 0b 49 8b 03 31 f6 f6 c4 40 0f 85 62 ff ff ff e9 61 ff ff ff kernel: RIP [] kfree+0x155/0x180 kernel: RSP kernel: ---[ end trace 3fdec044969def26 ]--- It seems to be most common after a server reboot where a client has been using a Kerberos mount, and reconnects to continue its workload. Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1d281816f2bf..d8582028b346 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -569,9 +569,10 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) struct rsc *found; memset(&rsci, 0, sizeof(rsci)); - rsci.handle.data = handle->data; - rsci.handle.len = handle->len; + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; found = rsc_lookup(cd, &rsci); + rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) -- cgit v1.2.3 From 8d51dbb8c7fb5412f0935c20f66e27d2c63ef4a0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 12 Sep 2016 15:55:43 +0200 Subject: mac80211: Re-structure aqm debugfs output and keep CoDel stats per txq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the 'aqm' stats in mac80211 only keeps overlimit drop stats, not CoDel stats. This moves the CoDel stats into the txqi structure to keep them per txq in order to show them in debugfs. In addition, the aqm debugfs output is restructured by splitting it up into three files: One global per phy, one per netdev and one per station, in the appropriate directories. The files are all called aqm, and are only created if the driver supports the wake_tx_queue op (rather than emitting an error on open as previously). Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 151 ++++++++---------------------------------- net/mac80211/debugfs_netdev.c | 37 ++++++++++- net/mac80211/debugfs_sta.c | 52 +++++++++++++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/tx.c | 4 +- 5 files changed, 117 insertions(+), 129 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2906c1004e1a..5bbb470f335f 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -71,138 +71,39 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x", DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? 
local->rate_ctrl->ops->name : "hw/driver"); -struct aqm_info { - struct ieee80211_local *local; - size_t size; - size_t len; - unsigned char buf[0]; -}; - -#define AQM_HDR_LEN 200 -#define AQM_HW_ENTRY_LEN 40 -#define AQM_TXQ_ENTRY_LEN 110 - -static int aqm_open(struct inode *inode, struct file *file) +static ssize_t aqm_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) { - struct ieee80211_local *local = inode->i_private; - struct ieee80211_sub_if_data *sdata; - struct sta_info *sta; - struct txq_info *txqi; + struct ieee80211_local *local = file->private_data; struct fq *fq = &local->fq; - struct aqm_info *info = NULL; + char buf[200]; int len = 0; - int i; - - if (!local->ops->wake_tx_queue) - return -EOPNOTSUPP; - - len += AQM_HDR_LEN; - len += 6 * AQM_HW_ENTRY_LEN; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - len += AQM_TXQ_ENTRY_LEN; - list_for_each_entry_rcu(sta, &local->sta_list, list) - len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq); - rcu_read_unlock(); - - info = vmalloc(len); - if (!info) - return -ENOMEM; spin_lock_bh(&local->fq.lock); rcu_read_lock(); - file->private_data = info; - info->local = local; - info->size = len; - len = 0; - - len += scnprintf(info->buf + len, info->size - len, - "* hw\n" - "access name value\n" - "R fq_flows_cnt %u\n" - "R fq_backlog %u\n" - "R fq_overlimit %u\n" - "R fq_collisions %u\n" - "RW fq_limit %u\n" - "RW fq_quantum %u\n", - fq->flows_cnt, - fq->backlog, - fq->overlimit, - fq->collisions, - fq->limit, - fq->quantum); - - len += scnprintf(info->buf + len, - info->size - len, - "* vif\n" - "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - txqi = to_txq_info(sdata->vif.txq); - len += scnprintf(info->buf + len, info->size - len, - "%s %pM %u %u %u %u %u %u %u %u\n", - sdata->name, - sdata->vif.addr, - txqi->txq.ac, - txqi->tin.backlog_bytes, - txqi->tin.backlog_packets, - txqi->tin.flows, - txqi->tin.overlimit, - txqi->tin.collisions, - txqi->tin.tx_bytes, - txqi->tin.tx_packets); - } - - len += scnprintf(info->buf + len, - info->size - len, - "* sta\n" - "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); - - list_for_each_entry_rcu(sta, &local->sta_list, list) { - sdata = sta->sdata; - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - txqi = to_txq_info(sta->sta.txq[i]); - len += scnprintf(info->buf + len, info->size - len, - "%s %pM %d %d %u %u %u %u %u %u %u\n", - sdata->name, - sta->sta.addr, - txqi->txq.tid, - txqi->txq.ac, - txqi->tin.backlog_bytes, - txqi->tin.backlog_packets, - txqi->tin.flows, - txqi->tin.overlimit, - txqi->tin.collisions, - txqi->tin.tx_bytes, - txqi->tin.tx_packets); - } - } - - info->len = len; + len = scnprintf(buf, sizeof(buf), + "access name value\n" + "R fq_flows_cnt %u\n" + "R fq_backlog %u\n" + "R fq_overlimit %u\n" + "R fq_collisions %u\n" + "RW fq_limit %u\n" + "RW fq_quantum %u\n", + fq->flows_cnt, + fq->backlog, + fq->overlimit, + fq->collisions, + fq->limit, + fq->quantum); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); - return 0; -} - -static int aqm_release(struct inode *inode, struct file *file) -{ - vfree(file->private_data); - return 0; -} - -static ssize_t aqm_read(struct file *file, - char __user *user_buf, - size_t count, - loff_t *ppos) -{ - struct aqm_info *info = file->private_data; - return simple_read_from_buffer(user_buf, count, ppos, - info->buf, 
info->len); + buf, len); } static ssize_t aqm_write(struct file *file, @@ -210,8 +111,7 @@ static ssize_t aqm_write(struct file *file, size_t count, loff_t *ppos) { - struct aqm_info *info = file->private_data; - struct ieee80211_local *local = info->local; + struct ieee80211_local *local = file->private_data; char buf[100]; size_t len; @@ -237,8 +137,7 @@ static ssize_t aqm_write(struct file *file, static const struct file_operations aqm_ops = { .write = aqm_write, .read = aqm_read, - .open = aqm_open, - .release = aqm_release, + .open = simple_open, .llseek = default_llseek, }; @@ -428,7 +327,9 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); - DEBUGFS_ADD_MODE(aqm, 0600); + + if (local->ops->wake_tx_queue) + DEBUGFS_ADD_MODE(aqm, 0600); statsd = debugfs_create_dir("statistics", phyd); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index a5ba739cd2a7..5d35c0f37bb7 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -30,7 +30,7 @@ static ssize_t ieee80211_if_read( size_t count, loff_t *ppos, ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int)) { - char buf[70]; + char buf[200]; ssize_t ret = -EINVAL; read_lock(&dev_base_lock); @@ -486,6 +486,38 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( } IEEE80211_IF_FILE_R(num_buffered_multicast); +static ssize_t ieee80211_if_fmt_aqm( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + struct ieee80211_local *local = sdata->local; + struct txq_info *txqi = to_txq_info(sdata->vif.txq); + int len; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + len = scnprintf(buf, + buflen, + "ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n" + "%u %u %u %u %u %u %u %u %u %u\n", + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->cstats.drop_count, + txqi->cstats.ecn_mark, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + return len; +} +IEEE80211_IF_FILE_R(aqm); + /* IBSS attributes */ static ssize_t ieee80211_if_fmt_tsf( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -618,6 +650,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz); DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); + + if (sdata->local->ops->wake_tx_queue) + DEBUGFS_ADD(aqm); } static void add_sta_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index fd334133ff45..fb2693582e40 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -133,6 +133,55 @@ static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, } STA_OPS(last_seq_ctrl); +#define AQM_TXQ_ENTRY_LEN 130 + +static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct ieee80211_local *local = sta->local; + size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1); + char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; + struct txq_info *txqi; + ssize_t rv; + int i; + + if (!buf) + return -ENOMEM; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + p += scnprintf(p, + bufsz+buf-p, + "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions 
tx-bytes tx-packets\n"); + + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + txqi = to_txq_info(sta->sta.txq[i]); + p += scnprintf(p, bufsz+buf-p, + "%d %d %u %u %u %u %u %u %u %u %u\n", + txqi->txq.tid, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->cstats.drop_count, + txqi->cstats.ecn_mark, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return rv; +} +STA_OPS(aqm); + static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -478,6 +527,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered); + if (local->ops->wake_tx_queue) + DEBUGFS_ADD(aqm); + if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, sta->debugfs_dir, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 75761686a98b..c71c73594790 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -818,6 +818,7 @@ struct txq_info { struct fq_tin tin; struct fq_flow def_flow; struct codel_vars def_cvars; + struct codel_stats cstats; unsigned long flags; /* keep last! */ @@ -1117,7 +1118,6 @@ struct ieee80211_local { struct fq fq; struct codel_vars *cvars; struct codel_params cparams; - struct codel_stats cstats; const struct ieee80211_ops *ops; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index efc38e7b90b9..ee9e7d60cb78 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1343,7 +1343,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); cparams = &local->cparams; - cstats = &local->cstats; + cstats = &txqi->cstats; if (flow == &txqi->def_flow) cvars = &txqi->def_cvars; @@ -1403,6 +1403,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, fq_tin_init(&txqi->tin); fq_flow_init(&txqi->def_flow); codel_vars_init(&txqi->def_cvars); + codel_stats_init(&txqi->cstats); txqi->txq.vif = &sdata->vif; @@ -1441,7 +1442,6 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) return ret; codel_params_init(&local->cparams); - codel_stats_init(&local->cstats); local->cparams.interval = MS2TIME(100); local->cparams.target = MS2TIME(20); local->cparams.ecn = true; -- cgit v1.2.3 From 11d62caf93cf12ce80ff8304849887666ec8880a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 08:28:22 +0200 Subject: mac80211: simplify TDLS RA lookup smatch pointed out that the second check of "tdls_auth" was pointless since if it was true, we returned from the function already. We can further simplify the code by moving the first check (if it's a TDLS peer at all) into the outer if, to only handle that inside. This simplifies the control flow here. 
Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ee9e7d60cb78..61d302d97145 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2263,15 +2263,9 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, skb->data); - if (sta) { - bool tdls_peer, tdls_auth; - - tdls_peer = test_sta_flag(sta, - WLAN_STA_TDLS_PEER); - tdls_auth = test_sta_flag(sta, - WLAN_STA_TDLS_PEER_AUTH); - - if (tdls_peer && tdls_auth) { + if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + if (test_sta_flag(sta, + WLAN_STA_TDLS_PEER_AUTH)) { *sta_out = sta; return 0; } @@ -2283,8 +2277,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, * after a TDLS sta is removed due to being * unreachable. */ - if (tdls_peer && !tdls_auth && - !ieee80211_is_tdls_setup(skb)) + if (!ieee80211_is_tdls_setup(skb)) return -EINVAL; } -- cgit v1.2.3 From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check if get valid seqadj pointer by the nfct_seqadj. Now drop the packet directly when fail to add seqadj extension to avoid dereference NULL pointer in nf_ct_seqadj_init from init_conntrack(). Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++ net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 6793614e6502..e6937318546c 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) #endif } +static inline bool nf_ct_add_synproxy(struct nf_conn *ct, + const struct nf_conn *tmpl) +{ + if (tmpl && nfct_synproxy(tmpl)) { + if (!nfct_seqadj_ext_add(ct)) + return false; + + if (!nfct_synproxy_ext_add(ct)) + return false; + } + + return true; +} + struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? 
nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { -- cgit v1.2.3 From 4854f175c3182816d906c4bc34be5f30556346a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 15:39:29 +0200 Subject: mac80211: remove useless open_count check __ieee80211_suspend() checks early on if there's anything to do by checking open_count, so there's no need to check again later in the function. Remove the useless check. Signed-off-by: Johannes Berg --- net/mac80211/pm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 00a43a70e1fc..28a3a0957c9e 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -178,8 +178,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) WARN_ON(!list_empty(&local->chanctx_list)); /* stop hardware - this must stop RX */ - if (local->open_count) - ieee80211_stop_device(local); + ieee80211_stop_device(local); suspend: local->suspended = true; -- cgit v1.2.3 From 308433155a67cb097142292c8943e0aa8d1a1c79 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 8 Sep 2016 12:50:43 -0400 Subject: net: bridge: add helper to call /sbin/bridge-stp If /sbin/bridge-stp is available on the system, bridge tries to execute it instead of the kernel implementation when starting/stopping STP. If anything goes wrong with /sbin/bridge-stp, bridge silently falls back to kernel STP, making hard to debug userspace STP. This patch adds a br_stp_call_user helper to start/stop userspace STP and debug errors from the program: abnormal exit status is stored in the lower byte and normal exit status is stored in higher byte. Below is a simple example on a kernel with dynamic debug enabled: # ln -s /bin/false /sbin/bridge-stp # brctl stp br0 on br0: failed to start userspace STP (256) # dmesg br0: /sbin/bridge-stp exited with code 1 br0: failed to start userspace STP (256) br0: using kernel STP Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/bridge/br_stp_if.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 341caa0ca63a..d8ad73b38de2 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -134,17 +134,36 @@ void br_stp_disable_port(struct net_bridge_port *p) br_become_root_bridge(br); } -static void br_stp_start(struct net_bridge *br) +static int br_stp_call_user(struct net_bridge *br, char *arg) { - int r; - char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; + char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL }; char *envp[] = { NULL }; + int rc; + + /* call userspace STP and report program errors */ + rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + if (rc > 0) { + if (rc & 0xff) + br_debug(br, BR_STP_PROG " received signal %d\n", + rc & 0x7f); + else + br_debug(br, BR_STP_PROG " exited with code %d\n", + (rc >> 8) & 0xff); + } + + return rc; +} + +static void br_stp_start(struct net_bridge *br) +{ struct net_bridge_port *p; + int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); - else - r = -ENOENT; + err = br_stp_call_user(br, "start"); + + if (err && err != -ENOENT) + br_err(br, "failed to start userspace STP (%d)\n", err); spin_lock_bh(&br->lock); @@ -153,9 +172,10 @@ static void br_stp_start(struct net_bridge *br) else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); - if (r == 0) { + if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); + /* Stop hello and hold timers */ del_timer(&br->hello_timer); list_for_each_entry(p, &br->port_list, list) @@ -173,14 +193,13 @@ static void br_stp_start(struct net_bridge *br) static void br_stp_stop(struct net_bridge *br) { - int r; - char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; - char *envp[] = { NULL }; struct net_bridge_port *p; + int err; if (br->stp_enabled == BR_USER_STP) { - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); - br_info(br, "userspace STP stopped, return code %d\n", r); + err = br_stp_call_user(br, "stop"); + if (err) + br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ mod_timer(&br->hello_timer, jiffies + br->hello_time); -- cgit v1.2.3 From c20cb8119337052a84e40cba94af732d870e22e3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 10 Sep 2016 00:56:55 +0000 Subject: tipc: fix possible memory leak in tipc_udp_enable() 'ub' is malloced in tipc_udp_enable() and should be freed before leaving from the error handling cases, otherwise it will cause memory leak. Fixes: ba5aa84a2d22 ("tipc: split UDP nl address parsing") Signed-off-by: Wei Yongjun Signed-off-by: David S. 
Miller --- net/tipc/udp_media.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index dd274687a53d..d80cd3f7503f 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -665,7 +665,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { pr_err("Invalid UDP bearer configuration"); - return -EINVAL; + err = -EINVAL; + goto err; } err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local, -- cgit v1.2.3 From 715f5552b1e90ba3eecf6d1a6d044d0d5226663f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 10 Sep 2016 23:11:23 +0800 Subject: sctp: hold the transport before using it in sctp_hash_cmp Since commit 4f0087812648 ("sctp: apply rhashtable api to send/recv path"), sctp uses transport rhashtable with .obj_cmpfn sctp_hash_cmp, in which it compares the members of the transport with the rhashtable args to check if it's the right transport. But sctp uses the transport without holding it in sctp_hash_cmp, it can cause a use-after-free panic. As after it gets transport from hashtable, another CPU may close the sk and free the asoc. In sctp_association_free, it frees all the transports, meanwhile, the assoc's refcnt may be reduced to 0, assoc can be destroyed by sctp_association_destroy. So after that, transport->assoc is actually an unavailable memory address in sctp_hash_cmp. Although sctp_hash_cmp is under rcu_read_lock, it still can not avoid this, as assoc is not freed by RCU. This patch is to hold the transport before checking it's members with sctp_transport_hold, in which it checks the refcnt first, holds it if it's not 0. Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/input.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..1555fb8c68e0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg { static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { + struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; - const struct sctp_transport *t = ptr; - struct sctp_association *asoc = t->asoc; - const struct net *net = x->net; + struct sctp_association *asoc; + int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) - return 1; - if (!net_eq(sock_net(asoc->base.sk), net)) - return 1; + return err; + if (!sctp_transport_hold(t)) + return err; + + asoc = t->asoc; + if (!net_eq(sock_net(asoc->base.sk), x->net)) + goto out; if (x->ep) { if (x->ep != asoc->ep) - return 1; + goto out; } else { if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; + goto out; if (!sctp_bind_addr_match(&asoc->base.bind_addr, x->laddr, sctp_sk(asoc->base.sk))) - return 1; + goto out; } - return 0; + err = 0; +out: + sctp_transport_put(t); + return err; } static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) -- cgit v1.2.3 From ad5987b47e96a0fb6d13fea250e936aed000093c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 15:53:55 +0200 Subject: nl80211: validate number of probe response CSA counters Due to an apparent copy/paste bug, the number of counters for the beacon configuration were checked twice, instead of checking the number of probe response counters. Fix this to check the number of probe response counters before parsing those. Cc: stable@vger.kernel.org Fixes: 9a774c78e211 ("cfg80211: Support multiple CSA counters") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f02653a08993..4809f4d2cdcc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6978,7 +6978,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_beacon > + (params.n_counter_offsets_presp > rdev->wiphy.max_num_csa_counters)) return -EINVAL; -- cgit v1.2.3 From b6b5555bc89f52e49244104ca4d7764c7b0f11cd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 16:25:58 +0200 Subject: cfg80211: disallow shared key authentication with key index 4 Key index 4 can only be used for an IGTK, so the range checks for shared key authentication should treat 4 as an error, fix that in the code. 
Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 2 +- net/wireless/nl80211.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index c284d883c349..d6abb0704db5 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -222,7 +222,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (auth_type == NL80211_AUTHTYPE_SHARED_KEY) - if (!key || !key_len || key_idx < 0 || key_idx > 4) + if (!key || !key_len || key_idx < 0 || key_idx > 3) return -EINVAL; if (wdev->current_bss && diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ebad350abc1..c11c1ef6daa7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7388,7 +7388,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 || key.p.key_len != WLAN_KEY_LEN_WEP104)) return -EINVAL; - if (key.idx > 4) + if (key.idx > 3) return -EINVAL; } else { key.p.key_len = 0; -- cgit v1.2.3 From 42ee231cd12c2e1eb015163d04a65950e895d4b7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 15:51:03 +0200 Subject: nl80211: fix connect keys range check Only key index 0-3 should be accepted, 4/5 are for IGTKs and cannot be used as connect keys. Fix the range checking to not allow such erroneous configurations. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c11c1ef6daa7..524f5d238c14 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -866,7 +866,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, err = -EINVAL; if (!parse.p.key) goto error; - if (parse.idx < 0 || parse.idx > 4) + if (parse.idx < 0 || parse.idx > 3) goto error; if (parse.def) { if (def) -- cgit v1.2.3 From 386b1f273850380a1887044673922843736c6d0a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 16:10:02 +0200 Subject: nl80211: only allow WEP keys during connect command This was already documented that way in nl80211.h, but the parsing code still accepted other key types. Change it to really only accept WEP keys as documented. 
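The filter itself is just a cipher whitelist applied before any key material is cached: only WEP40 and WEP104 may appear among connect keys. A stand-alone sketch (the two suite selector values are assumed to match the standard ones in ieee80211.h):

    #include <stdbool.h>
    #include <stdint.h>

    #define WLAN_CIPHER_SUITE_WEP40   0x000fac01
    #define WLAN_CIPHER_SUITE_WEP104  0x000fac05

    static bool connect_key_cipher_ok(uint32_t cipher)
    {
        return cipher == WLAN_CIPHER_SUITE_WEP40 ||
               cipher == WLAN_CIPHER_SUITE_WEP104;
    }

Anything else is rejected with -EINVAL, matching what nl80211.h has documented all along.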
Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 524f5d238c14..6cb33ae38ae3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -881,16 +881,19 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, parse.idx, false, NULL); if (err) goto error; + if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 && + parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) { + err = -EINVAL; + goto error; + } result->params[parse.idx].cipher = parse.p.cipher; result->params[parse.idx].key_len = parse.p.key_len; result->params[parse.idx].key = result->data[parse.idx]; memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len); - if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 || - parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) { - if (no_ht) - *no_ht = true; - } + /* must be WEP key if we got here */ + if (no_ht) + *no_ht = true; } return result; -- cgit v1.2.3 From 9381e267b69acfea96c8429dc99da3e78835cef1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 16:11:32 +0200 Subject: cfg80211: wext: only allow WEP keys to be configured before connected When not connected, anything but WEP keys shouldn't be allowed to be configured for later - only static WEP keys make sense at this point. Change wext to reject anything else just like nl80211 does. Signed-off-by: Johannes Berg --- net/wireless/wext-compat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 9f27221c8913..e45a76449b43 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -487,6 +487,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, err = 0; if (wdev->current_bss) err = rdev_add_key(rdev, dev, idx, pairwise, addr, params); + else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && + params->cipher != WLAN_CIPHER_SUITE_WEP104) + return -EINVAL; if (err) return err; -- cgit v1.2.3 From e9c8f8d3a4d54106a30f2b981b53d658c9bc0c8e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 16:37:40 +0200 Subject: cfg80211: validate key index better Don't accept it if a key_idx < 0 snuck through, reject WEP keys with key index 4 and 5 (which are used for IGTKs) and don't allow IGTKs with key indices other than 4 and 5. This makes the key data match expectations better. 
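Taken together the rules are: the index must lie in 0..5 overall, WEP keys may only use 0..3 (4 and 5 are reserved for IGTKs), and BIP/IGTK ciphers may only use 4 or 5. A compact sketch of that decision table, with an invented enum standing in for the cipher classes:

    #include <stdbool.h>

    enum key_kind { KEY_WEP, KEY_IGTK, KEY_OTHER };

    static bool key_idx_ok(enum key_kind kind, int idx)
    {
        if (idx < 0 || idx > 5)
            return false;
        switch (kind) {
        case KEY_WEP:
            return idx <= 3;     /* 4 and 5 belong to IGTKs */
        case KEY_IGTK:
            return idx >= 4;     /* only 4 or 5 make sense */
        default:
            return true;
        }
    }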
Signed-off-by: Johannes Berg --- net/wireless/util.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 0675f513e7b9..12e2d3fae843 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -218,7 +218,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { - if (key_idx > 5) + if (key_idx < 0 || key_idx > 5) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) @@ -249,7 +249,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; + if (key_idx < 4) + return -EINVAL; break; + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + if (key_idx > 3) + return -EINVAL; default: break; } -- cgit v1.2.3 From 89b706fb28e431fa7639348536c284fb375eb3c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 16:39:38 +0200 Subject: cfg80211: reduce connect key caching struct size After the previous patches, connect keys can only (correctly) be used for storing static WEP keys. Therefore, remove all the data for dealing with key index 4/5 and reduce the size of the key material to the maximum for WEP keys. Signed-off-by: Johannes Berg --- net/wireless/core.h | 6 +++--- net/wireless/ibss.c | 6 ++---- net/wireless/nl80211.c | 1 - net/wireless/util.c | 5 +---- net/wireless/wext-compat.c | 6 +++--- net/wireless/wext-sme.c | 3 +-- 6 files changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index eee91443924d..5555e3c13ae9 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -249,9 +249,9 @@ struct cfg80211_event { }; struct cfg80211_cached_keys { - struct key_params params[6]; - u8 data[6][WLAN_MAX_KEY_LEN]; - int def, defmgmt; + struct key_params params[4]; + u8 data[4][WLAN_KEY_LEN_WEP104]; + int def; }; enum cfg80211_chan_mode { diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 4a4dda53bdf1..896cbb20b6e1 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -284,10 +284,8 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, if (!netif_running(wdev->netdev)) return 0; - if (wdev->wext.keys) { + if (wdev->wext.keys) wdev->wext.keys->def = wdev->wext.default_key; - wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key; - } wdev->wext.ibss.privacy = wdev->wext.default_key != -1; @@ -295,7 +293,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < 6; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6cb33ae38ae3..71af96e8a947 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -854,7 +854,6 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, return ERR_PTR(-ENOMEM); result->def = -1; - result->defmgmt = -1; nla_for_each_nested(key, keys, rem) { memset(&parse, 0, sizeof(parse)); diff --git a/net/wireless/util.c b/net/wireless/util.c index 12e2d3fae843..9e6e2aaa7766 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -912,7 +912,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) if (!wdev->connect_keys) return; - for (i = 0; i < 6; i++) { + 
for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, i, false, NULL, @@ -925,9 +925,6 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) netdev_err(dev, "failed to set defkey %d\n", i); continue; } - if (wdev->connect_keys->defmgmt == i) - if (rdev_set_default_mgmt_key(rdev, dev, i)) - netdev_err(dev, "failed to set mgtdef %d\n", i); } kzfree(wdev->connect_keys); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index e45a76449b43..7b97d43b27e1 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -408,10 +408,10 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (!wdev->wext.keys) { wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), - GFP_KERNEL); + GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; - for (i = 0; i < 6; i++) + for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } @@ -460,7 +460,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (err == -ENOENT) err = 0; if (!err) { - if (!addr) { + if (!addr && idx < 4) { memset(wdev->wext.keys->data[idx], 0, sizeof(wdev->wext.keys->data[idx])); wdev->wext.keys->params[idx].key_len = 0; diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index a4e8af3321d2..f6523a4387cc 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -35,7 +35,6 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, if (wdev->wext.keys) { wdev->wext.keys->def = wdev->wext.default_key; - wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key; if (wdev->wext.default_key != -1) wdev->wext.connect.privacy = true; } @@ -47,7 +46,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < 6; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } -- cgit v1.2.3 From 08a39685a771b4b1108889ea5e4e0a71b51782ba Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:21 +0100 Subject: rxrpc: Make sure we initialise the peer hash key Peer records created for incoming connections weren't getting their hash key set. This meant that incoming calls wouldn't see more than one DATA packet - which is not a problem for AFS CM calls with small request data blobs. Signed-off-by: David Howells --- net/rxrpc/peer_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 2efe29a4c232..3e6cd174b53d 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -203,6 +203,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) */ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) { + peer->hash_key = hash_key; rxrpc_assess_MTU_size(peer); peer->mtu = peer->if_mtu; @@ -238,7 +239,6 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, peer = rxrpc_alloc_peer(local, gfp); if (peer) { - peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(peer, hash_key); } -- cgit v1.2.3 From bc4abfcf51835420d61440b2b7aa18181bc1f273 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:21 +0100 Subject: rxrpc: Add missing wakeup on Tx window rotation We need to wake up the sender when Tx window rotation due to an incoming ACK makes space in the buffer otherwise the sender is liable to just hang endlessly. 
This problem isn't noticeable if the Tx phase transfers no more than will fit in a single window or the Tx window rotates fast enough that it doesn't get full. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index afeba98004b1..a707d5952164 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,8 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) spin_unlock(&call->lock); + wake_up(&call->waitq); + while (list) { skb = list; list = skb->next; -- cgit v1.2.3 From 91c2c7b656a80984362dbcb3d326e4a7274d0607 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:21 +0100 Subject: rxrpc: The IDLE ACK packet should use rxrpc_idle_ack_delay The IDLE ACK packet should use the rxrpc_idle_ack_delay setting when the timer is set for it. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2b976e789562..61432049869b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -95,7 +95,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, break; case RXRPC_ACK_IDLE: - if (rxrpc_soft_ack_delay < expiry) + if (rxrpc_idle_ack_delay < expiry) expiry = rxrpc_idle_ack_delay; break; -- cgit v1.2.3 From 33b603fda815faf12f66156a49b510126fac984b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:21 +0100 Subject: rxrpc: Requeue call for recvmsg if more data rxrpc_recvmsg() needs to make sure that the call it has just been processing gets requeued for further attention if the buffer has been filled and there's more data to be consumed. The softirq producer only queues the call and wakes the socket if it fills the first slot in the window, so userspace might end up sleeping forever otherwise, despite there being data available. This is not a problem provided the userspace buffer is big enough or it empties the buffer completely before more data comes in. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 20d0b5c6f81b..16ff56f69256 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -463,6 +463,10 @@ try_again: flags, &copied); if (ret == -EAGAIN) ret = 0; + + if (after(call->rx_top, call->rx_hard_ack) && + call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK]) + rxrpc_notify_socket(call); break; default: ret = 0; -- cgit v1.2.3 From b25de3605339c94a6c27d42efe8f7748ea206a8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:22 +0100 Subject: rxrpc: Add missing unlock in rxrpc_call_accept() Add a missing unlock in rxrpc_call_accept() in the path taken if there's no call to wake up. 
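This is the classic lock-balance bug: a function takes a lock, and an early-return path added later forgets to drop it. The shape of the fix is to unlock (or jump to a common exit label) on every path out of the critical section, as in this reduced sketch using a pthread mutex in place of the socket's call_lock:

    #include <pthread.h>
    #include <stddef.h>

    static pthread_mutex_t call_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *pending_call;                  /* NULL when nothing is queued */

    static void *accept_call(void)
    {
        void *call;

        pthread_mutex_lock(&call_lock);
        if (!pending_call) {
            pthread_mutex_unlock(&call_lock);   /* the previously missing unlock */
            return NULL;
        }
        call = pending_call;
        pending_call = NULL;
        pthread_mutex_unlock(&call_lock);
        return call;
    }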
Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index b8acec0d596e..06e328f6b0f0 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -425,9 +425,11 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->to_be_accepted)) - goto out; + if (list_empty(&rx->to_be_accepted)) { + write_unlock(&rx->call_lock); + kleave(" = -ENODATA [empty]"); + return ERR_PTR(-ENODATA); + } /* check the user ID isn't already in use */ pp = &rx->calls.rb_node; -- cgit v1.2.3 From 89a80ed4c09afb2aff6abe32f6dd68605f857a7d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:22 +0100 Subject: rxrpc: Use skb->len not skb->data_len skb->len should be used rather than skb->data_len when referring to the amount of data in a packet. This will only cause a malfunction in the following cases: (1) We receive a jumbo packet (validation and splitting both are wrong). (2) We see if there's extra ACK info in an ACK packet (we think it's not there and just ignore it). Signed-off-by: David Howells --- net/rxrpc/input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a707d5952164..5958ef8ba2a0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -127,7 +127,7 @@ static bool rxrpc_validate_jumbo(struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int offset = sp->offset; - unsigned int len = skb->data_len; + unsigned int len = skb->len; int nr_jumbo = 1; u8 flags = sp->hdr.flags; @@ -196,7 +196,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, u8 ack = 0, flags, annotation = 0; _enter("{%u,%u},{%u,%u}", - call->rx_hard_ack, call->rx_top, skb->data_len, seq); + call->rx_hard_ack, call->rx_top, skb->len, seq); _proto("Rx DATA %%%u { #%u f=%02x }", sp->hdr.serial, seq, sp->hdr.flags); @@ -233,7 +233,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, next_subpacket: queued = false; ix = seq & RXRPC_RXTX_BUFF_MASK; - len = skb->data_len; + len = skb->len; if (flags & RXRPC_JUMBO_PACKET) len = RXRPC_JUMBO_DATALEN; @@ -444,7 +444,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } offset = sp->offset + nr_acks + 3; - if (skb->data_len >= offset + sizeof(buf.info)) { + if (skb->len >= offset + sizeof(buf.info)) { if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0) return rxrpc_proto_abort("XAI", call, 0); rxrpc_input_ackinfo(call, skb, &buf.info); -- cgit v1.2.3 From 01fd0742248cfc99b3b0cba1e09e1c0ecb8658fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 10:23:01 +0100 Subject: rxrpc: Allow tx_winsize to grow in response to an ACK Allow tx_winsize to grow when the ACK info packet shows a larger receive window at the other end rather than only permitting it to shrink. 
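Put differently, tx_winsize should simply track the peer's advertised rwind, clamped to what the local Tx ring can hold, rather than acting as a one-way ratchet downwards. A sketch of the intended update (RXRPC_RXTX_BUFF_SIZE is the 64-slot ring from the patch, with one slot kept free):

    #include <stdint.h>

    #define RXRPC_RXTX_BUFF_SIZE 64

    static uint8_t tx_winsize;

    static void update_tx_winsize(uint32_t advertised_rwind)
    {
        if (advertised_rwind > RXRPC_RXTX_BUFF_SIZE - 1)
            advertised_rwind = RXRPC_RXTX_BUFF_SIZE - 1;
        tx_winsize = (uint8_t)advertised_rwind;     /* may grow or shrink */
    }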
Signed-off-by: David Howells --- net/rxrpc/input.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5958ef8ba2a0..8e529afcd6c1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -333,14 +333,16 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; unsigned int mtu; + u32 rwind = ntohl(ackinfo->rwind); _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", sp->hdr.serial, ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), - ntohl(ackinfo->rwind), ntohl(ackinfo->jumbo_max)); + rwind, ntohl(ackinfo->jumbo_max)); - if (call->tx_winsize > ntohl(ackinfo->rwind)) - call->tx_winsize = ntohl(ackinfo->rwind); + if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) + rwind = RXRPC_RXTX_BUFF_SIZE - 1; + call->tx_winsize = rwind; mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); -- cgit v1.2.3 From cbd00891de9bb4756bac6f6edfa945d5a6468977 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 09:12:34 +0100 Subject: rxrpc: Adjust the call ref tracepoint to show kernel API refs Adjust the call ref tracepoint to show references held on a call by the kernel API separately as much as possible and add an additional trace to at the allocation point from the preallocation buffer for an incoming call. Note that this doesn't show the allocation of a client call for the kernel separately at the moment. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_accept.c | 3 ++- net/rxrpc/call_object.c | 2 ++ 4 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index caa226dd436e..25d00ded24bc 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -299,7 +299,7 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); rxrpc_release_call(rxrpc_sk(sock->sk), call); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_end_call); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b1cb79ec4e96..47c74a581a0f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -540,8 +540,10 @@ enum rxrpc_call_trace { rxrpc_call_seen, rxrpc_call_got, rxrpc_call_got_userid, + rxrpc_call_got_kernel, rxrpc_call_put, rxrpc_call_put_userid, + rxrpc_call_put_kernel, rxrpc_call_put_noqueue, rxrpc_call__nr_trace }; diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 06e328f6b0f0..5fd9d2c89b7f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -121,7 +121,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - rxrpc_get_call(call, rxrpc_call_got); + rxrpc_get_call(call, rxrpc_call_got_kernel); user_attach_call(call, user_call_ID); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); @@ -300,6 +300,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, smp_store_release(&b->call_backlog_tail, (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + rxrpc_see_call(call); call->conn = conn; call->peer = rxrpc_get_peer(conn->params.peer); return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 18ab13f82f6e..3f9476508204 100644 --- a/net/rxrpc/call_object.c +++ 
b/net/rxrpc/call_object.c @@ -56,8 +56,10 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", [rxrpc_call_got_userid] = "Gus", + [rxrpc_call_got_kernel] = "Gke", [rxrpc_call_put] = "PUT", [rxrpc_call_put_userid] = "Pus", + [rxrpc_call_put_kernel] = "Pke", [rxrpc_call_put_noqueue] = "PNQ", }; -- cgit v1.2.3 From 3432a757b1f889f8c0d33cd9fcabdae172ed812b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 09:05:14 +0100 Subject: rxrpc: Fix prealloc refcounting The preallocated call buffer holds a ref on the calls within that buffer. The ref was being released in the wrong place - it worked okay for incoming calls to the AFS cache manager service, but doesn't work right for incoming calls to a userspace service. Instead of releasing an extra ref service calls in rxrpc_release_call(), the ref needs to be released during the acceptance/rejectance process. To this end: (1) The prealloc ref is now normally released during rxrpc_new_incoming_call(). (2) For preallocated kernel API calls, the kernel API's ref needs to be released when the call is discarded on socket close. (3) We shouldn't take a second ref in rxrpc_accept_call(). (4) rxrpc_recvmsg_new_call() needs to get a ref of its own when it adds the call to the to_be_accepted socket queue. In doing (4) above, we would prefer not to put the call's refcount down to 0 as that entails doing cleanup in softirq context, but it's unlikely as there are several refs held elsewhere, at least one of which must be put by someone in process context calling rxrpc_release_call(). However, it's not a problem if we do have to do that. Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 9 ++++++++- net/rxrpc/call_object.c | 3 --- net/rxrpc/recvmsg.c | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 5fd9d2c89b7f..26c293ef98eb 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -221,6 +221,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); + rxrpc_put_call(call, rxrpc_call_put_kernel); } rxrpc_call_completed(call); rxrpc_release_call(rx, call); @@ -402,6 +403,13 @@ found_service: if (call->state == RXRPC_CALL_SERVER_ACCEPTING) rxrpc_notify_socket(call); + /* We have to discard the prealloc queue's ref here and rely on a + * combination of the RCU read lock and refs held either by the socket + * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel + * service to prevent the call from being deallocated too early. 
+ */ + rxrpc_put_call(call, rxrpc_call_put); + _leave(" = %p{%d}", call, call->debug_id); out: spin_unlock(&rx->incoming_lock); @@ -469,7 +477,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, } /* formalise the acceptance */ - rxrpc_get_call(call, rxrpc_call_got); call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; rxrpc_get_call(call, rxrpc_call_got_userid); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 3f9476508204..9aa1c4b53563 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -464,9 +464,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->rxtx_buffer[i] = NULL; } - /* We have to release the prealloc backlog ref */ - if (rxrpc_is_service_call(call)) - rxrpc_put_call(call, rxrpc_call_put); _leave(""); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 16ff56f69256..a284205b8ecf 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -118,6 +118,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, list_del_init(&call->recvmsg_link); write_unlock_bh(&rx->recvmsg_lock); + rxrpc_get_call(call, rxrpc_call_got); write_lock(&rx->call_lock); list_add_tail(&call->accept_link, &rx->to_be_accepted); write_unlock(&rx->call_lock); -- cgit v1.2.3 From 75e42126399220069ada0ca0e93237993c6afccf Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 22:36:22 +0100 Subject: rxrpc: Correctly initialise, limit and transmit call->rx_winsize call->rx_winsize should be initialised to the sysctl setting and the sysctl setting should be limited to the maximum we want to permit. Further, we need to place this in the ACK info instead of the sysctl setting. Furthermore, discard the idea of accepting the subpackets of a jumbo packet that lie beyond the receive window when the first packet of the jumbo is within the window. Just discard the excess subpackets instead. This allows the receive window to be opened up right to the buffer size less one for the dead slot. 
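[Editorial note, not part of the patch] A compact way to read this change: the per-call window starts from the sysctl default, that default is bounded at build time by the ring size less the dead slot, and the value advertised in the ACK info is the per-call window rather than the raw sysctl. A minimal sketch using the names from the hunks below (not a literal excerpt):

	#define RXRPC_RXTX_BUFF_SIZE		64	/* Rx/Tx ring entries */
	#define RXRPC_INIT_RX_WINDOW_SIZE	32	/* sysctl default */
	#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
	#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
	#endif

	/* at call allocation time */
	call->rx_winsize = rxrpc_rx_window_size;	/* per-call, seeded from the sysctl */

	/* later, when filling out an ACK packet */
	pkt->ackinfo.rwind = htonl(call->rx_winsize);	/* advertise the per-call window */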
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 ++- net/rxrpc/call_object.c | 2 +- net/rxrpc/input.c | 23 ++++++++++++++++------- net/rxrpc/misc.c | 5 ++++- net/rxrpc/output.c | 4 ++-- net/rxrpc/sysctl.c | 2 +- 6 files changed, 26 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 47c74a581a0f..e78c40b37db5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -498,6 +498,7 @@ struct rxrpc_call { */ #define RXRPC_RXTX_BUFF_SIZE 64 #define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) +#define RXRPC_INIT_RX_WINDOW_SIZE 32 struct sk_buff **rxtx_buffer; u8 *rxtx_annotations; #define RXRPC_TX_ANNO_ACK 0 @@ -518,7 +519,7 @@ struct rxrpc_call { rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ - u8 nr_jumbo_dup; /* Number of jumbo duplicates */ + u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ u8 ackr_reason; /* reason to ACK */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9aa1c4b53563..22f9b0d1a138 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -152,7 +152,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) memset(&call->sock_node, 0xed, sizeof(call->sock_node)); /* Leave space in the ring to handle a maxed-out jumbo packet */ - call->rx_winsize = RXRPC_RXTX_BUFF_SIZE - 1 - 46; + call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->rx_expect_next = 1; return call; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8e529afcd6c1..75af0bd316c7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -164,7 +164,7 @@ protocol_error: * (that information is encoded in the ACK packet). */ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, - u8 annotation, bool *_jumbo_dup) + u8 annotation, bool *_jumbo_bad) { /* Discard normal packets that are duplicates. */ if (annotation == 0) @@ -174,9 +174,9 @@ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, * more partially duplicate jumbo packets, we refuse to take any more * jumbos for this call. 
*/ - if (!*_jumbo_dup) { - call->nr_jumbo_dup++; - *_jumbo_dup = true; + if (!*_jumbo_bad) { + call->nr_jumbo_bad++; + *_jumbo_bad = true; } } @@ -191,7 +191,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, unsigned int ix; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; rxrpc_seq_t seq = sp->hdr.seq, hard_ack; - bool immediate_ack = false, jumbo_dup = false, queued; + bool immediate_ack = false, jumbo_bad = false, queued; u16 len; u8 ack = 0, flags, annotation = 0; @@ -222,7 +222,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, flags = sp->hdr.flags; if (flags & RXRPC_JUMBO_PACKET) { - if (call->nr_jumbo_dup > 3) { + if (call->nr_jumbo_bad > 3) { ack = RXRPC_ACK_NOSPACE; ack_serial = serial; goto ack; @@ -259,7 +259,7 @@ next_subpacket: } if (call->rxtx_buffer[ix]) { - rxrpc_input_dup_data(call, seq, annotation, &jumbo_dup); + rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad); if (ack != RXRPC_ACK_DUPLICATE) { ack = RXRPC_ACK_DUPLICATE; ack_serial = serial; @@ -304,6 +304,15 @@ skip: annotation++; if (flags & RXRPC_JUMBO_PACKET) annotation |= RXRPC_RX_ANNO_JLAST; + if (after(seq, hard_ack + call->rx_winsize)) { + ack = RXRPC_ACK_EXCEEDS_WINDOW; + ack_serial = serial; + if (!jumbo_bad) { + call->nr_jumbo_bad++; + jumbo_bad = true; + } + goto ack; + } _proto("Rx DATA Jumbo %%%u", serial); goto next_subpacket; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fd096f742e4b..8b910780f1ac 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -50,7 +50,10 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned int rxrpc_rx_window_size = RXRPC_RXTX_BUFF_SIZE - 46; +unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE; +#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE +#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE +#endif /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 719a4c23f09d..90c7722d5779 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -71,10 +71,10 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, mtu = call->conn->params.peer->if_mtu; mtu -= call->conn->params.peer->hdrsize; - jmax = (call->nr_jumbo_dup > 3) ? 1 : rxrpc_rx_jumbo_max; + jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); - pkt->ackinfo.rwind = htonl(rxrpc_rx_window_size); + pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); *ackp++ = 0; diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index b7ca8cf13c84..a03c61c672f5 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -20,7 +20,7 @@ static const unsigned int one = 1; static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; -static const unsigned int n_max_acks = RXRPC_MAXACKS; +static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; /* * RxRPC operating parameters. 
-- cgit v1.2.3 From cd5892c756f51ed6ff18ff49c837d219bfd9bb5d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 08:49:05 +0100 Subject: rxrpc: Create an address for sendmsg() to bind unbound socket with Create an address for sendmsg() to bind unbound socket with rather than using a completely blank address otherwise the transport socket creation will fail because it will try to use address family 0. We use the address family specified in the protocol argument when the AF_RXRPC socket was created and SOCK_DGRAM as the default. For anything else, bind() must be used. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 25d00ded24bc..741b0d8d2e8c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -401,6 +401,18 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) switch (rx->sk.sk_state) { case RXRPC_UNBOUND: + rx->srx.srx_family = AF_RXRPC; + rx->srx.srx_service = 0; + rx->srx.transport_type = SOCK_DGRAM; + rx->srx.transport.family = rx->family; + switch (rx->family) { + case AF_INET: + rx->srx.transport_len = sizeof(struct sockaddr_in); + break; + default: + ret = -EAFNOSUPPORT; + goto error_unlock; + } local = rxrpc_lookup_local(&rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); -- cgit v1.2.3 From aaa31cbc66733386406464ec6c5c0889d9968a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 08:49:05 +0100 Subject: rxrpc: Don't specify protocol to when creating transport socket Pass 0 as the protocol argument when creating the transport socket rather than IPPROTO_UDP. Signed-off-by: David Howells --- net/rxrpc/local_object.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 782b9adf67cb..8720be2a6250 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -103,8 +103,8 @@ static int rxrpc_open_socket(struct rxrpc_local *local) _enter("%p{%d}", local, local->srx.transport_type); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, - IPPROTO_UDP, &local->socket); + ret = sock_create_kern(&init_net, local->srx.transport.family, + local->srx.transport_type, 0, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; -- cgit v1.2.3 From 1c2bc7b948a2adee0d3e070f4ce14645efa0a2d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 08:49:05 +0100 Subject: rxrpc: Use rxrpc_extract_addr_from_skb() rather than doing this manually There are two places that want to transmit a packet in response to one just received and manually pick the address to reply to out of the sk_buff. Make them use rxrpc_extract_addr_from_skb() instead so that IPv6 is handled automatically. 
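[Editorial note, not part of the patch] The pattern being introduced, sketched from the hunks below rather than quoted literally: instead of open-coding AF_INET fields from the IP and UDP headers, the reply path fills a sockaddr_rxrpc and lets the helper pick the right address family.

	struct sockaddr_rxrpc srx;
	struct msghdr msg;

	if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
		return;				/* unsupported or truncated source address */

	msg.msg_name	= &srx.transport;	/* works for both IPv4 and IPv6 senders */
	msg.msg_namelen	= srx.transport_len;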
Signed-off-by: David Howells --- net/rxrpc/local_event.c | 13 +++++-------- net/rxrpc/output.c | 32 ++++++-------------------------- 2 files changed, 11 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index cdd58e6e9fbd..f073e932500e 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -15,8 +15,6 @@ #include #include #include -#include -#include #include #include #include @@ -33,7 +31,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, { struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct sockaddr_in sin; + struct sockaddr_rxrpc srx; struct msghdr msg; struct kvec iov[2]; size_t len; @@ -41,12 +39,11 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, _enter(""); - sin.sin_family = AF_INET; - sin.sin_port = udp_hdr(skb)->source; - sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + return; - msg.msg_name = &sin; - msg.msg_namelen = sizeof(sin); + msg.msg_name = &srx.transport; + msg.msg_namelen = srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 90c7722d5779..ec3621f2c5c8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -15,8 +15,6 @@ #include #include #include -#include -#include #include #include #include "ar-internal.h" @@ -272,10 +270,7 @@ send_fragmentable: */ void rxrpc_reject_packets(struct rxrpc_local *local) { - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; + struct sockaddr_rxrpc srx; struct rxrpc_skb_priv *sp; struct rxrpc_wire_header whdr; struct sk_buff *skb; @@ -292,32 +287,21 @@ void rxrpc_reject_packets(struct rxrpc_local *local) iov[1].iov_len = sizeof(code); size = sizeof(whdr) + sizeof(code); - msg.msg_name = &sa; + msg.msg_name = &srx.transport; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - memset(&whdr, 0, sizeof(whdr)); whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb); sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { + msg.msg_namelen = srx.transport_len; + code = htonl(skb->priority); whdr.epoch = htonl(sp->hdr.epoch); @@ -329,10 +313,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.flags &= RXRPC_CLIENT_INITIATED; kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; } rxrpc_free_skb(skb); -- cgit v1.2.3 From 75b54cb57ca34cbe7a87c6ac757c55360a624590 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Sep 2016 08:49:05 +0100 Subject: rxrpc: Add IPv6 support Add IPv6 support to AF_RXRPC. With this, AF_RXRPC sockets can be created: service = socket(AF_RXRPC, SOCK_DGRAM, PF_INET6); instead of: service = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); The AFS filesystem doesn't support IPv6 at the moment, though, since that requires upgrades to some of the RPC calls. Note that a good portion of this patch is replacing "%pI4:%u" in print statements with "%pISpc" which is able to handle both protocols and print the port. 
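[Editorial note, not part of the patch] A minimal user-space sketch of the new transport, assuming a kernel with this series applied and that struct sockaddr_rxrpc is exported through linux/rxrpc.h; the field layout follows the kernel-side code in this series, so treat the details as illustrative rather than a tested example.

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/rxrpc.h>

	/* Create an AF_RXRPC service socket carried over UDP/IPv6. */
	static int rxrpc_ipv6_socket(unsigned short service)
	{
		struct sockaddr_rxrpc srx;
		int fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET6);

		if (fd < 0)
			return -1;

		memset(&srx, 0, sizeof(srx));
		srx.srx_family			= AF_RXRPC;
		srx.srx_service			= service;
		srx.transport_type		= SOCK_DGRAM;
		srx.transport_len		= sizeof(srx.transport.sin6);
		srx.transport.sin6.sin6_family	= AF_INET6;	/* any local port and address */

		if (bind(fd, (struct sockaddr *)&srx, sizeof(srx)) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}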
Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 15 +++++-- net/rxrpc/conn_object.c | 8 ++++ net/rxrpc/local_object.c | 35 +++++++-------- net/rxrpc/output.c | 16 +++++++ net/rxrpc/peer_event.c | 24 +++++++++++ net/rxrpc/peer_object.c | 109 +++++++++++++++++++++++++++++------------------ net/rxrpc/proc.c | 30 ++++++------- 7 files changed, 154 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 741b0d8d2e8c..f61f7b2d1ca4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -106,19 +106,23 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, case AF_INET: if (srx->transport_len < sizeof(struct sockaddr_in)) return -EINVAL; - _debug("INET: %x @ %pI4", - ntohs(srx->transport.sin.sin_port), - &srx->transport.sin.sin_addr); tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; case AF_INET6: + if (srx->transport_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + tail = offsetof(struct sockaddr_rxrpc, transport) + + sizeof(struct sockaddr_in6); + break; + default: return -EAFNOSUPPORT; } if (tail < len) memset((void *)srx + tail, 0, len - tail); + _debug("INET: %pISp", &srx->transport); return 0; } @@ -409,6 +413,9 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) case AF_INET: rx->srx.transport_len = sizeof(struct sockaddr_in); break; + case AF_INET6: + rx->srx.transport_len = sizeof(struct sockaddr_in6); + break; default: ret = -EAFNOSUPPORT; goto error_unlock; @@ -563,7 +570,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; /* we support transport protocol UDP/UDP6 only */ - if (protocol != PF_INET) + if (protocol != PF_INET && protocol != PF_INET6) return -EPROTONOSUPPORT; if (sock->type != SOCK_DGRAM) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ffa9addb97b2..c0ddba787fd4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -134,6 +134,14 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, srx.transport.sin.sin_addr.s_addr) goto not_found; break; + case AF_INET6: + if (peer->srx.transport.sin6.sin6_port != + srx.transport.sin6.sin6_port || + memcmp(&peer->srx.transport.sin6.sin6_addr, + &srx.transport.sin6.sin6_addr, + sizeof(struct in6_addr)) != 0) + goto not_found; + break; default: BUG(); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 8720be2a6250..f5b9bb0d3f98 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -58,6 +58,15 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); + case AF_INET6: + /* If the choice of UDP6 port is left up to the transport, then + * the endpoint record doesn't match. 
+ */ + return ((u16 __force)local->srx.transport.sin6.sin6_port - + (u16 __force)srx->transport.sin6.sin6_port) ?: + memcmp(&local->srx.transport.sin6.sin6_addr, + &srx->transport.sin6.sin6_addr, + sizeof(struct in6_addr)); default: BUG(); } @@ -100,7 +109,8 @@ static int rxrpc_open_socket(struct rxrpc_local *local) struct sock *sock; int ret, opt; - _enter("%p{%d}", local, local->srx.transport_type); + _enter("%p{%d,%d}", + local, local->srx.transport_type, local->srx.transport.family); /* create a socket to represent the local endpoint */ ret = sock_create_kern(&init_net, local->srx.transport.family, @@ -169,18 +179,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) long diff; int ret; - if (srx->transport.family == AF_INET) { - _enter("{%d,%u,%pI4+%hu}", - srx->transport_type, - srx->transport.family, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); - } else { - _enter("{%d,%u}", - srx->transport_type, - srx->transport.family); - return ERR_PTR(-EAFNOSUPPORT); - } + _enter("{%d,%d,%pISp}", + srx->transport_type, srx->transport.family, &srx->transport); mutex_lock(&rxrpc_local_mutex); @@ -233,13 +233,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) found: mutex_unlock(&rxrpc_local_mutex); - _net("LOCAL %s %d {%d,%u,%pI4+%hu}", - age, - local->debug_id, - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); + _net("LOCAL %s %d {%pISp}", + age, local->debug_id, &local->srx.transport); _leave(" = %p", local); return local; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ec3621f2c5c8..d7cd87f17f0d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -258,6 +258,22 @@ send_fragmentable: (char *)&opt, sizeof(opt)); } break; + + case AF_INET6: + opt = IPV6_PMTUDISC_DONT; + ret = kernel_setsockopt(conn->params.local->socket, + SOL_IPV6, IPV6_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 1, iov[0].iov_len); + + opt = IPV6_PMTUDISC_DO; + kernel_setsockopt(conn->params.local->socket, + SOL_IPV6, IPV6_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + } + break; } up_write(&conn->params.local->defrag_sem); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index c8948936c6fc..74217589cf44 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -66,6 +66,30 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, } break; + case AF_INET6: + srx.transport.sin6.sin6_port = serr->port; + srx.transport_len = sizeof(struct sockaddr_in6); + switch (serr->ee.ee_origin) { + case SO_EE_ORIGIN_ICMP6: + _net("Rx ICMP6"); + memcpy(&srx.transport.sin6.sin6_addr, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in6_addr)); + break; + case SO_EE_ORIGIN_ICMP: + _net("Rx ICMP on v6 sock"); + memcpy(&srx.transport.sin6.sin6_addr.s6_addr + 12, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in_addr)); + break; + default: + memcpy(&srx.transport.sin6.sin6_addr, + &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + break; + } + break; + default: BUG(); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 3e6cd174b53d..dfc07b41a472 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -16,12 +16,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "ar-internal.h" static 
DEFINE_HASHTABLE(rxrpc_peer_hash, 10); @@ -50,6 +52,11 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; + case AF_INET6: + hash_key += (u16 __force)srx->transport.sin.sin_port; + size = sizeof(srx->transport.sin6.sin6_addr); + p = (u16 *)&srx->transport.sin6.sin6_addr; + break; default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; @@ -93,6 +100,12 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); + case AF_INET6: + return ((u16 __force)peer->srx.transport.sin6.sin6_port - + (u16 __force)srx->transport.sin6.sin6_port) ?: + memcmp(&peer->srx.transport.sin6.sin6_addr, + &srx->transport.sin6.sin6_addr, + sizeof(struct in6_addr)); default: BUG(); } @@ -130,17 +143,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer) { - switch (srx->transport.family) { - case AF_INET: - _net("PEER %d {%d,%u,%pI4+%hu}", - peer->debug_id, - peer->srx.transport_type, - peer->srx.transport.family, - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - break; - } - + _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); } return peer; @@ -152,22 +155,49 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, */ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) { + struct dst_entry *dst; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; + struct flowi6 *fl6 = &fl.u.ip6; peer->if_mtu = 1500; - rt = ip_route_output_ports(&init_net, &fl4, NULL, - peer->srx.transport.sin.sin_addr.s_addr, 0, - htons(7000), htons(7001), - IPPROTO_UDP, 0, 0); - if (IS_ERR(rt)) { - _leave(" [route err %ld]", PTR_ERR(rt)); - return; + memset(&fl, 0, sizeof(fl)); + switch (peer->srx.transport.family) { + case AF_INET: + rt = ip_route_output_ports( + &init_net, fl4, NULL, + peer->srx.transport.sin.sin_addr.s_addr, 0, + htons(7000), htons(7001), IPPROTO_UDP, 0, 0); + if (IS_ERR(rt)) { + _leave(" [route err %ld]", PTR_ERR(rt)); + return; + } + dst = &rt->dst; + break; + + case AF_INET6: + fl6->flowi6_iif = LOOPBACK_IFINDEX; + fl6->flowi6_scope = RT_SCOPE_UNIVERSE; + fl6->flowi6_proto = IPPROTO_UDP; + memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr, + sizeof(struct in6_addr)); + fl6->fl6_dport = htons(7001); + fl6->fl6_sport = htons(7000); + dst = ip6_route_output(&init_net, NULL, fl6); + if (IS_ERR(dst)) { + _leave(" [route err %ld]", PTR_ERR(dst)); + return; + } + break; + + default: + BUG(); } - peer->if_mtu = dst_mtu(&rt->dst); - dst_release(&rt->dst); + peer->if_mtu = dst_mtu(dst); + dst_release(dst); _leave(" [if_mtu %u]", peer->if_mtu); } @@ -207,17 +237,22 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) rxrpc_assess_MTU_size(peer); peer->mtu = peer->if_mtu; - if (peer->srx.transport.family == AF_INET) { + switch (peer->srx.transport.family) { + case AF_INET: peer->hdrsize = sizeof(struct iphdr); - switch (peer->srx.transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { + break; + case AF_INET6: + peer->hdrsize = sizeof(struct ipv6hdr); + break; + default: + BUG(); + } + + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + 
peer->hdrsize += sizeof(struct udphdr); + break; + default: BUG(); } @@ -285,11 +320,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, *candidate; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); - _enter("{%d,%d,%pI4+%hu}", - srx->transport_type, - srx->transport_len, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); + _enter("{%pISp}", &srx->transport); /* search the peer list first */ rcu_read_lock(); @@ -326,11 +357,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, peer = candidate; } - _net("PEER %d {%d,%pI4+%hu}", - peer->debug_id, - peer->srx.transport_type, - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); + _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); return peer; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index d529d1b4021c..65cd980767fa 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,11 +52,12 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_sock *rx; struct rxrpc_peer *peer; struct rxrpc_call *call; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + char lbuff[50], rbuff[50]; if (v == &rxrpc_calls) { seq_puts(seq, - "Proto Local Remote " + "Proto Local " + " Remote " " SvID ConnID CallID End Use State Abort " " UserID\n"); return 0; @@ -68,9 +69,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (rx) { local = READ_ONCE(rx->local); if (local) - sprintf(lbuff, "%pI4:%u", - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); + sprintf(lbuff, "%pISpc", &local->srx.transport); else strcpy(lbuff, "no_local"); } else { @@ -79,14 +78,12 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) peer = call->peer; if (peer) - sprintf(rbuff, "%pI4:%u", - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); + sprintf(rbuff, "%pISpc", &peer->srx.transport); else strcpy(rbuff, "no_connection"); seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %lx\n", lbuff, rbuff, @@ -145,11 +142,12 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + char lbuff[50], rbuff[50]; if (v == &rxrpc_connection_proc_list) { seq_puts(seq, - "Proto Local Remote " + "Proto Local " + " Remote " " SvID ConnID End Use State Key " " Serial ISerial\n" ); @@ -163,16 +161,12 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) goto print; } - sprintf(lbuff, "%pI4:%u", - &conn->params.local->srx.transport.sin.sin_addr, - ntohs(conn->params.local->srx.transport.sin.sin_port)); + sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport); - sprintf(rbuff, "%pI4:%u", - &conn->params.peer->srx.transport.sin.sin_addr, - ntohs(conn->params.peer->srx.transport.sin.sin_port)); + sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport); print: seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %s %3u" + "UDP %-47.47s %-47.47s %4x %08x %s %3u" " %s %08x %08x %08x\n", lbuff, rbuff, -- cgit v1.2.3 From 0b97a484e52cb423662eb98904aad82dafcc1f10 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:41:34 +0200 Subject: mac80211: check skb_linearize() return value The A-MSDU TX code (within TXQs) didn't 
always check the return value of skb_linearize() properly, resulting in potentially passing a frag- list SKB down to the driver even when it said it can't handle it. Fix that. Fixes: 6e0456b545456 ("mac80211: add A-MSDU tx support") Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cc8e95554b48..18b285e06bc8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1515,8 +1515,12 @@ out: spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) - skb_linearize(skb); + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + return NULL; + } + } return skb; } -- cgit v1.2.3 From 85d5313ed717ad60769491c7c072d23bc0a68e7a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 11:38:31 +0200 Subject: mac80211: reject TSPEC TIDs (TSIDs) for aggregation Since mac80211 doesn't currently support TSIDs 8-15 which can only be used after QoS TSPEC negotiation (and not even after WMM negotiation), reject attempts to set up aggregation sessions for them, which might confuse drivers. In mac80211 we do correctly handle that, but the TSIDs should never get used anyway, and drivers might not be able to handle it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 8 +++++++- net/mac80211/agg-tx.c | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..afa94687d5e1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, .timeout = timeout, .ssn = start_seq_num, }; - int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + if (tid >= IEEE80211_FIRST_TSPEC_TSID) { + ht_dbg(sta->sdata, + "STA %pM requests BA session on unsupported tid %d\n", + sta->sta.addr, tid); + goto end_no_lock; + } + if (!sta->sta.ht_cap.ht_supported) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 5650c46bf91a..45319cc01121 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; + if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID)) + return -EINVAL; + ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", pubsta->addr, tid); -- cgit v1.2.3 From f1c1f17ac52d22227c0074b3d661d7ed692b707a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 17:08:23 +0200 Subject: cfg80211: allow connect keys only with default (TX) key There's no point in allowing connect keys when one of them isn't also configured as the TX key, it would just confuse drivers and probably cause them to pick something for TX. Disallow this confusing and erroneous configuration. As wpa_supplicant will always send NL80211_ATTR_KEYS, even when there are no keys inside, allow that and treat it as though the attribute isn't present at all. 
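[Editorial note, not part of the patch] The empty-attribute handling described above amounts to scanning the nest before parsing; a sketch along the lines of the nl80211_parse_connkeys() hunk below (not a literal excerpt):

	struct nlattr *key;
	bool have_key = false;
	int rem;

	nla_for_each_nested(key, keys, rem) {
		have_key = true;	/* at least one key attribute is present */
		break;
	}
	if (!have_key)
		return NULL;		/* empty nest: behave as if NL80211_ATTR_KEYS were absent */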
Signed-off-by: Johannes Berg --- net/wireless/ibss.c | 5 ++++- net/wireless/nl80211.c | 14 ++++++++++++++ net/wireless/sme.c | 3 +++ net/wireless/wext-sme.c | 2 +- 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 896cbb20b6e1..eafdfa5798ae 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -114,6 +114,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, } } + if (WARN_ON(connkeys && connkeys->def < 0)) + return -EINVAL; + if (WARN_ON(wdev->connect_keys)) kzfree(wdev->connect_keys); wdev->connect_keys = connkeys; @@ -289,7 +292,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, wdev->wext.ibss.privacy = wdev->wext.default_key != -1; - if (wdev->wext.keys) { + if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 71af96e8a947..f2a77c3daa59 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -848,6 +848,15 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, struct nlattr *key; struct cfg80211_cached_keys *result; int rem, err, def = 0; + bool have_key = false; + + nla_for_each_nested(key, keys, rem) { + have_key = true; + break; + } + + if (!have_key) + return NULL; result = kzalloc(sizeof(*result), GFP_KERNEL); if (!result) @@ -895,6 +904,11 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, *no_ht = true; } + if (result->def < 0) { + err = -EINVAL; + goto error; + } + return result; error: kfree(result); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index add6824c44fd..c08a3b57dca1 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1043,6 +1043,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } + } else { + if (WARN_ON(connkeys)) + return -EINVAL; } wdev->connect_keys = connkeys; diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index f6523a4387cc..88f1f6931ab8 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -42,7 +42,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, if (!wdev->wext.connect.ssid_len) return 0; - if (wdev->wext.keys) { + if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; -- cgit v1.2.3 From 93db1d9e6c96050b74bb2793de8db00cd0afe6ab Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:23:51 +0200 Subject: mac80211: fix possible out-of-bounds access In the unlikely situation that the supplicant has negotiated admission for the background AC (which it has no reason to as it's not supposed to be requiring admission control to start with, and we'd ignore such a requirement anyway), the loop here may terminate with non_acm_ac == 4, which leads to an array overrun. Check this explicitly just for completeness. 
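[Editorial note, not part of the patch] The overrun scenario and its guard, sketched with the mlme.c names; the loop bounds are simplified here, so this is not a literal excerpt.

	int non_acm_ac;

	/* Search for an AC that does not require admission control. */
	for (non_acm_ac = 0; non_acm_ac < IEEE80211_NUM_ACS; non_acm_ac++)
		if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac)))
			break;

	/* If every AC, including BK, is marked as requiring admission, the
	 * loop exits with non_acm_ac == IEEE80211_NUM_ACS; clamp it before
	 * using it as an index into tx_conf[].
	 */
	if (non_acm_ac >= IEEE80211_NUM_ACS)
		non_acm_ac = IEEE80211_AC_BK;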
Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8d426f637f58..7486f2dab4ba 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1672,11 +1672,15 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) non_acm_ac++) if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) break; - /* The loop will result in using BK even if it requires - * admission control, such configuration makes no sense - * and we have to transmit somehow - the AC selection - * does the same thing. + /* Usually the loop will result in using BK even if it + * requires admission control, but such a configuration + * makes no sense and we have to transmit somehow - the + * AC selection does the same thing. + * If we started out trying to downgrade from BK, then + * the extra condition here might be needed. */ + if (non_acm_ac >= IEEE80211_NUM_ACS) + non_acm_ac = IEEE80211_AC_BK; if (drv_conf_tx(local, sdata, ac, &sdata->tx_conf[non_acm_ac])) sdata_err(sdata, -- cgit v1.2.3 From 58bd7f1158ac7543ccdcddc7f4ecd7db458e6d0b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:37:54 +0200 Subject: mac80211: fix scan completed tracing Passing the 'info' pointer where a 'info->aborted' is expected will always lead to tracing to erroneously record that the scan was aborted, fix that by passing the correct info->aborted. The remaining data will be collected in cfg80211, so I haven't duplicated it here. Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 070b40f15850..23d8ac829279 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -420,7 +420,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, { struct ieee80211_local *local = hw_to_local(hw); - trace_api_scan_completed(local, info); + trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) -- cgit v1.2.3 From 76e1fb4b5532a9df9eb14cfe002412c7617c4ad0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:55:57 +0200 Subject: nl80211: always check nla_nest_start() return value If the message got full during nla_nest_start(), it can return NULL. None of the cases here seem like that can really happen, but check the return value nonetheless. 
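[Editorial note, not part of the patch] The defensive pattern being applied, as a generic sketch; HYPOTHETICAL_ATTR and value are placeholders, the rest follows the netlink attribute API used in the hunks below.

	struct nlattr *nest;

	nest = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH);
	if (!nest)
		return -ENOBUFS;	/* message buffer already full */

	if (nla_put_u32(msg, HYPOTHETICAL_ATTR, value))
		return -ENOBUFS;

	nla_nest_end(msg, nest);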
Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f2a77c3daa59..60c8a7429d33 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8022,6 +8022,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, } data = nla_nest_start(skb, attr); + if (!data) + goto nla_put_failure; ((void **)skb->cb)[0] = rdev; ((void **)skb->cb)[1] = hdr; @@ -9458,8 +9460,14 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (req->n_match_sets) { matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH); + if (!matches) + return -ENOBUFS; + for (i = 0; i < req->n_match_sets; i++) { match = nla_nest_start(msg, i); + if (!match) + return -ENOBUFS; + nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, req->match_sets[i].ssid.ssid_len, req->match_sets[i].ssid.ssid); @@ -9474,6 +9482,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, for (i = 0; i < req->n_scan_plans; i++) { scan_plan = nla_nest_start(msg, i + 1); + if (!scan_plan) + return -ENOBUFS; + if (!scan_plan || nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, req->scan_plans[i].interval) || -- cgit v1.2.3 From 53b18980fded52e39520661af3528577d36eb279 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:59:21 +0200 Subject: nl80211: always check nla_put* return values A few instances were found where we didn't check them, add the missing checks even though they'll probably never trigger as the message should be large enough here. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 60c8a7429d33..887c4c114206 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9453,8 +9453,10 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!freqs) return -ENOBUFS; - for (i = 0; i < req->n_channels; i++) - nla_put_u32(msg, i, req->channels[i]->center_freq); + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + return -ENOBUFS; + } nla_nest_end(msg, freqs); @@ -9468,9 +9470,10 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!match) return -ENOBUFS; - nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, - req->match_sets[i].ssid.ssid_len, - req->match_sets[i].ssid.ssid); + if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, + req->match_sets[i].ssid.ssid_len, + req->match_sets[i].ssid.ssid)) + return -ENOBUFS; nla_nest_end(msg, match); } nla_nest_end(msg, matches); -- cgit v1.2.3 From 5140974dca69f0eace465bccd93891ca242a7e61 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:58:31 +0200 Subject: mac80211: remove unused assignment The next line overwrites this assignment, so remove it; there's no real value in using it for the next assignment either. 
Signed-off-by: Johannes Berg --- net/mac80211/util.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e777c2a6568f..b6865d884487 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2555,7 +2555,6 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, if (need_basic && basic_rates & BIT(i)) basic = 0x80; - rate = sband->bitrates[i].bitrate; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5 * (1 << shift)); *pos++ = basic | (u8) rate; -- cgit v1.2.3 From 8826fef95bd5f846f7745d9ce1e3009927ec0cb8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 10:00:23 +0200 Subject: mac80211: remove pointless chanctx NULL check If chanctx is derived as container_of() from a non-NULL pointer, it can't ever be NULL. Since we checked conf before, that's true here, so remove the useless NULL check. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5d4afead804e..e29ff5749944 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2961,10 +2961,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, } chanctx = container_of(conf, struct ieee80211_chanctx, conf); - if (!chanctx) { - err = -EBUSY; - goto out; - } ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; -- cgit v1.2.3 From c7e9dbcf09bddd01568113103d62423d8894eabd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 10:03:00 +0200 Subject: mac80211: remove sta_remove_debugfs driver callback No drivers implement this, relying either on the recursive directory removal to remove their debugfs, or not having any to start with. Remove the dead driver callback. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 ++--------- net/mac80211/debugfs_sta.c | 4 ---- net/mac80211/driver-ops.h | 15 --------------- 3 files changed, 2 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 08bac23c8de1..d9c8ccd6b4e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3101,11 +3101,8 @@ enum ieee80211_reconfig_type { * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback - * and @sta_remove_debugfs should be within a CONFIG_MAC80211_DEBUGFS - * conditional. This callback can sleep. - * - * @sta_remove_debugfs: Remove the debugfs files which were added using - * @sta_add_debugfs. This callback can sleep. + * should be within a CONFIG_MAC80211_DEBUGFS conditional. This + * callback can sleep. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. 
For a VIF operating @@ -3501,10 +3498,6 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); - void (*sta_remove_debugfs)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index fb2693582e40..a2fcdb47a0e6 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -544,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) void ieee80211_sta_debugfs_remove(struct sta_info *sta) { - struct ieee80211_local *local = sta->local; - struct ieee80211_sub_if_data *sdata = sta->sdata; - - drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index c39f93b48791..fe35a1c0dc86 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -499,21 +499,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } - -static inline void drv_sta_remove_debugfs(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, - struct dentry *dir) -{ - might_sleep(); - - sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); - - if (local->ops->sta_remove_debugfs) - local->ops->sta_remove_debugfs(&local->hw, &sdata->vif, - sta, dir); -} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, -- cgit v1.2.3 From e8a24cd4b87247beedb1addc7b683422092047e5 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Wed, 14 Sep 2016 12:48:32 +0530 Subject: mac80211: allow driver to handle packet-loss mechanism Based on consecutive msdu failures, mac80211 triggers CQM packet-loss mechanism. Drivers like ath10k that have its own connection monitoring algorithm, offloaded to firmware for triggering station kickout. In case of station kickout, driver will report low ack status by mac80211 API (ieee80211_report_low_ack). This flag will enable the driver to completely rely on firmware events for station kickout and bypass mac80211 packet loss mechanism. Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/debugfs.c | 1 + net/mac80211/status.c | 6 ++++++ 3 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d9c8ccd6b4e6..5296100f3889 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2018,6 +2018,11 @@ struct ieee80211_txq { * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list * skbs, needed for zero-copy software A-MSDU. * + * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event + * by ieee80211_report_low_ack() based on its own algorithm. For such + * drivers, mac80211 packet loss mechanism will not be triggered and driver + * is completely depending on firmware event for station kickout. 
+ * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2058,6 +2063,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_USES_RSS, IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, + IEEE80211_HW_REPORTS_LOW_ACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5bbb470f335f..8ca62b6bb02a 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -201,6 +201,7 @@ static const char *hw_flag_names[] = { FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), + FLAG(REPORTS_LOW_ACK), #undef FLAG }; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index fabd9ff710d9..ea39f8a7baf3 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { + /* If driver relies on its own algorithm for station kickout, skip + * mac80211 packet loss mechanism. + */ + if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) -- cgit v1.2.3 From ec53c832ee90b86414ca243d0e6fdbb9cf5e413b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 15 Sep 2016 10:57:50 +0200 Subject: cfg80211: remove unnecessary pointer-of For an array, there's no need to use &array, so just use the plain wiphy->addresses[i].addr here to silence smatch. Signed-off-by: Johannes Berg --- net/wireless/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index e46469bc130f..0082f4b01795 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -57,7 +57,7 @@ static ssize_t addresses_show(struct device *dev, return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) - buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr); + buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } -- cgit v1.2.3 From 07c0f09e23b47815251ed9e5ce245a58c6391974 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 12 Sep 2016 15:19:21 +0300 Subject: net/sched: act_tunnel_key: Remove rcu_read_lock protection Remove rcu_read_lock protection from tunnel_key_dump and use rtnl_dereference, dump operation is protected by rtnl lock. Also, remove rcu_read_lock from tunnel_key_release and use rcu_dereference_protected. Both operations are running exclusively and a writer couldn't modify t->params while those functions are executed. Fixes: 54d94fd89d90 ('net/sched: Introduce act_tunnel_key') Signed-off-by: Hadar Hen Zion Acked-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/sched/act_tunnel_key.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index dceff7412dc3..af47bdf2f483 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -194,15 +194,12 @@ static void tunnel_key_release(struct tc_action *a, int bind) struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; - rcu_read_lock(); - params = rcu_dereference(t->params); + params = rcu_dereference_protected(t->params, 1); if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) dst_release(¶ms->tcft_enc_metadata->dst); kfree_rcu(params, rcu); - - rcu_read_unlock(); } static int tunnel_key_dump_addresses(struct sk_buff *skb, @@ -245,10 +242,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .bindcnt = t->tcf_bindcnt - bind, }; struct tcf_t tm; - int ret = -1; - rcu_read_lock(); - params = rcu_dereference(t->params); + params = rtnl_dereference(t->params); opt.t_action = params->tcft_action; opt.action = params->action; @@ -272,15 +267,11 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - ret = skb->len; - goto out; + return skb->len; nla_put_failure: nlmsg_trim(skb, b); -out: - rcu_read_unlock(); - - return ret; + return -1; } static int tunnel_key_walker(struct net *net, struct sk_buff *skb, -- cgit v1.2.3 From 04b3f8de4b6d90758938a40303c0ee9a86bcb8ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 12 Sep 2016 23:38:42 +0200 Subject: bpf: drop unnecessary test in cls_bpf_classify and tcf_bpf The skb_mac_header_was_set() test in cls_bpf's and act_bpf's fast-path is actually unnecessary and can be removed altogether. This was added by commit a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets"), which was later on improved by 3431205e0397 ("bpf: make programs see skb->data == L2 for ingress and egress"). We're always guaranteed to have valid mac header at the time we invoke cls_bpf_classify() or tcf_bpf(). Reason is that since 6d1ccff62780 ("net: reset mac header in dev_start_xmit()") we do skb_reset_mac_header() in __dev_queue_xmit() before we could call into sch_handle_egress() or any subsequent enqueue. sch_handle_ingress() always sees a valid mac header as well (things like skb_reset_mac_len() would badly fail otherwise). Thus, drop the unnecessary test in classifier and action case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/sched/act_bpf.c | 3 --- net/sched/cls_bpf.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index bfa870731e74..78400defa790 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -44,9 +44,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, int action, filter_res; bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; - if (unlikely(!skb_mac_header_was_set(skb))) - return TC_ACT_UNSPEC; - tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 4742f415ee5b..1d92d4d3f222 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -83,9 +83,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - if (unlikely(!skb_mac_header_was_set(skb))) - return -1; - /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { -- cgit v1.2.3 From f53d8c7b18faf1bd361abe91f3c4bcbb21d0c985 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 12 Sep 2016 23:38:43 +0200 Subject: bpf: use skb_at_tc_ingress helper in tcf_bpf We have a small skb_at_tc_ingress() helper for testing for ingress, so make use of it. cls_bpf already uses it and so should act_bpf. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 78400defa790..1d3960033f61 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -39,10 +39,10 @@ static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { + bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; - bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); -- cgit v1.2.3 From 86da71b57383d40993cb90baafb3735cffe5d800 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 12 Sep 2016 20:13:09 -0400 Subject: net_sched: Introduce skbmod action This action is intended to be an upgrade from a usability perspective from pedit (as well as operational debugability). Compare this: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action pedit munge offset -14 u8 set 0x02 \ munge offset -13 u8 set 0x15 \ munge offset -12 u8 set 0x15 \ munge offset -11 u8 set 0x15 \ munge offset -10 u16 set 0x1515 \ pipe to: sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \ u32 match ip protocol 1 0xff flowid 1:2 \ action skbmod dmac 02:15:15:15:15:15 Also try to do a MAC address swap with pedit or worse try to debug a policy with destination mac, source mac and etherype. Then make few rules out of those and you'll get my point. In the future common use cases on pedit can be migrated to this action (as an example different fields in ip v4/6, transports like tcp/udp/sctp etc). For this first cut, this allows modifying basic ethernet header. The most important ethernet use case at the moment is when redirecting or mirroring packets to a remote machine. 
The dst mac address needs a re-write so that it doesnt get dropped or confuse an interconnecting (learning) switch or dropped by a target machine (which looks at the dst mac). And at times when flipping back the packet a swap of the MAC addresses is needed. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbmod.h | 30 ++++ include/uapi/linux/tc_act/tc_skbmod.h | 39 +++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/act_skbmod.c | 301 ++++++++++++++++++++++++++++++++++ 5 files changed, 382 insertions(+) create mode 100644 include/net/tc_act/tc_skbmod.h create mode 100644 include/uapi/linux/tc_act/tc_skbmod.h create mode 100644 net/sched/act_skbmod.c (limited to 'net') diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..644a2116b47b --- /dev/null +++ b/include/net/tc_act/tc_skbmod.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __NET_TC_SKBMOD_H +#define __NET_TC_SKBMOD_H + +#include +#include + +struct tcf_skbmod_params { + struct rcu_head rcu; + u64 flags; /*up to 64 types of operations; extend if needed */ + u8 eth_dst[ETH_ALEN]; + u16 eth_type; + u8 eth_src[ETH_ALEN]; +}; + +struct tcf_skbmod { + struct tc_action common; + struct tcf_skbmod_params __rcu *skbmod_p; +}; +#define to_skbmod(a) ((struct tcf_skbmod *)a) + +#endif /* __NET_TC_SKBMOD_H */ diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h new file mode 100644 index 000000000000..10fc07da6c69 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#ifndef __LINUX_TC_SKBMOD_H +#define __LINUX_TC_SKBMOD_H + +#include + +#define TCA_ACT_SKBMOD 15 + +#define SKBMOD_F_DMAC 0x1 +#define SKBMOD_F_SMAC 0x2 +#define SKBMOD_F_ETYPE 0x4 +#define SKBMOD_F_SWAPMAC 0x8 + +struct tc_skbmod { + tc_gen; + __u64 flags; +}; + +enum { + TCA_SKBMOD_UNSPEC, + TCA_SKBMOD_TM, + TCA_SKBMOD_PARMS, + TCA_SKBMOD_DMAC, + TCA_SKBMOD_SMAC, + TCA_SKBMOD_ETYPE, + TCA_SKBMOD_PAD, + __TCA_SKBMOD_MAX +}; +#define TCA_SKBMOD_MAX (__TCA_SKBMOD_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 72e3426fa48f..7795d5a3f79a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -749,6 +749,17 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_SKBMOD + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbmod. 
+ config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index b9d046b9535a..148ae0d5ac2c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c new file mode 100644 index 000000000000..e7d96381c908 --- /dev/null +++ b/net/sched/act_skbmod.c @@ -0,0 +1,301 @@ +/* + * net/sched/act_skbmod.c skb data modifier + * + * Copyright (c) 2016 Jamal Hadi Salim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SKBMOD_TAB_MASK 15 + +static int skbmod_net_id; +static struct tc_action_ops act_skbmod_ops; + +#define MAX_EDIT_LEN ETH_HLEN +static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbmod *d = to_skbmod(a); + int action; + struct tcf_skbmod_params *p; + u64 flags; + int err; + + tcf_lastuse_update(&d->tcf_tm); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + /* XXX: if you are going to edit more fields beyond ethernet header + * (example when you add IP header replacement or vlan swap) + * then MAX_EDIT_LEN needs to change appropriately + */ + err = skb_ensure_writable(skb, MAX_EDIT_LEN); + if (unlikely(err)) { /* best policy is to drop on the floor */ + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; + } + + rcu_read_lock(); + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) { + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + rcu_read_unlock(); + return action; + } + + p = rcu_dereference(d->skbmod_p); + flags = p->flags; + if (flags & SKBMOD_F_DMAC) + ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); + if (flags & SKBMOD_F_SMAC) + ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); + if (flags & SKBMOD_F_ETYPE) + eth_hdr(skb)->h_proto = p->eth_type; + rcu_read_unlock(); + + if (flags & SKBMOD_F_SWAPMAC) { + u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ + /*XXX: I am sure we can come up with more efficient swapping*/ + ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source); + ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); + } + + return action; +} + +static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { + [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) }, + [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 }, +}; + +static int tcf_skbmod_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct nlattr *tb[TCA_SKBMOD_MAX + 1]; + struct tcf_skbmod_params *p, *p_old; + struct tc_skbmod *parm; + struct 
tcf_skbmod *d; + bool exists = false; + u8 *daddr = NULL; + u8 *saddr = NULL; + u16 eth_type = 0; + u32 lflags = 0; + int ret = 0, err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy); + if (err < 0) + return err; + + if (!tb[TCA_SKBMOD_PARMS]) + return -EINVAL; + + if (tb[TCA_SKBMOD_DMAC]) { + daddr = nla_data(tb[TCA_SKBMOD_DMAC]); + lflags |= SKBMOD_F_DMAC; + } + + if (tb[TCA_SKBMOD_SMAC]) { + saddr = nla_data(tb[TCA_SKBMOD_SMAC]); + lflags |= SKBMOD_F_SMAC; + } + + if (tb[TCA_SKBMOD_ETYPE]) { + eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]); + lflags |= SKBMOD_F_ETYPE; + } + + parm = nla_data(tb[TCA_SKBMOD_PARMS]); + if (parm->flags & SKBMOD_F_SWAPMAC) + lflags = SKBMOD_F_SWAPMAC; + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!lflags) + return -EINVAL; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_skbmod_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + d = to_skbmod(*a); + + ASSERT_RTNL(); + p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); + if (unlikely(!p)) { + if (ovr) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + p->flags = lflags; + d->tcf_action = parm->action; + + p_old = rtnl_dereference(d->skbmod_p); + + if (ovr) + spin_lock_bh(&d->tcf_lock); + + if (lflags & SKBMOD_F_DMAC) + ether_addr_copy(p->eth_dst, daddr); + if (lflags & SKBMOD_F_SMAC) + ether_addr_copy(p->eth_src, saddr); + if (lflags & SKBMOD_F_ETYPE) + p->eth_type = htons(eth_type); + + rcu_assign_pointer(d->skbmod_p, p); + if (ovr) + spin_unlock_bh(&d->tcf_lock); + + if (p_old) + kfree_rcu(p_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbmod *d = to_skbmod(a); + struct tcf_skbmod_params *p; + + p = rcu_dereference_protected(d->skbmod_p, 1); + kfree_rcu(p, rcu); +} + +static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_skbmod *d = to_skbmod(a); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tc_skbmod opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + opt.flags = p->flags; + if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_DMAC) && + nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_SMAC) && + nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_ETYPE) && + nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type))) + goto nla_put_failure; + + tcf_tm_dump(&t, &d->tcf_tm); + if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + rcu_read_unlock(); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return 
tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_skbmod_ops = { + .kind = "skbmod", + .type = TCA_ACT_SKBMOD, + .owner = THIS_MODULE, + .act = tcf_skbmod_run, + .dump = tcf_skbmod_dump, + .init = tcf_skbmod_init, + .cleanup = tcf_skbmod_cleanup, + .walk = tcf_skbmod_walker, + .lookup = tcf_skbmod_search, + .size = sizeof(struct tcf_skbmod), +}; + +static __net_init int skbmod_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK); +} + +static void __net_exit skbmod_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbmod_net_ops = { + .init = skbmod_init_net, + .exit = skbmod_exit_net, + .id = &skbmod_net_id, + .size = sizeof(struct tc_action_net), +}; + +MODULE_AUTHOR("Jamal Hadi Salim, "); +MODULE_DESCRIPTION("SKB data mod-ing"); +MODULE_LICENSE("GPL"); + +static int __init skbmod_init_module(void) +{ + return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops); +} + +static void __exit skbmod_cleanup_module(void) +{ + tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops); +} + +module_init(skbmod_init_module); +module_exit(skbmod_cleanup_module); -- cgit v1.2.3 From aa72d708373dacfa690960b336543b867784b350 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:22 +0300 Subject: net/sched: cls_flower: Support masking for matching on tcp/udp ports Add the definitions for src/dst udp/tcp port masks and use them when setting && dumping the relevant keys. Signed-off-by: Or Gerlitz Signed-off-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 4 ++++ net/sched/cls_flower.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index f9c287c67eae..60ea2a084880 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -442,6 +442,10 @@ enum { TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b084b2aab2d7..027523c82797 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -335,6 +335,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -432,17 +436,17 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, 
sizeof(key->tp.dst)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); } @@ -877,18 +881,18 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (key->basic.ip_proto == IPPROTO_TCP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; -- cgit v1.2.3 From a53d850a79c39b97a2d954d0db5c481c44e8dc7c Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 15 Sep 2016 15:28:23 +0300 Subject: net/sched: cls_flower: Remove an unused field from the filter key structure Commit c3f8324188fa "net: Add full IPv6 addresses to flow_keys" added an unused instance of struct flow_dissector_key_addrs into struct fl_flow_key, remove it. Signed-off-by: Or Gerlitz Reported-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 027523c82797..a3f4c706dfaa 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -33,7 +33,6 @@ struct fl_flow_key { struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; - struct flow_dissector_key_addrs ipaddrs; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; -- cgit v1.2.3 From 2679d040412df847d390a3a8f0f224a7c91f7fae Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Tue, 13 Sep 2016 10:08:54 -0400 Subject: openvswitch: avoid deferred execution of recirc actions The ovs kernel data path currently defers the execution of all recirc actions until stack utilization is at a minimum. This is too limiting for some packet forwarding scenarios due to the small size of the deferred action FIFO (10 entries). For example, broadcast traffic sent out more than 10 ports with recirculation results in packet drops when the deferred action FIFO becomes full, as reported here: http://openvswitch.org/pipermail/dev/2016-March/067672.html Since the current recursion depth is available (it is already tracked by the exec_actions_level pcpu variable), we can use it to determine whether to execute recirculation actions immediately (safe when recursion depth is low) or defer execution until more stack space is available. With this change, the deferred action fifo size becomes a non-issue for currently failing scenarios because it is no longer used when there are three or fewer recursions through ovs_execute_actions(). 
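For illustration, the decision reduces to comparing the current recursion level against a threshold derived from the recursion limit: with a limit of 5 and a threshold of 5 - 2 = 3, levels 1-3 recirculate inline using a preallocated per-CPU key slot, and only deeper levels fall back to the small deferred-action FIFO. A condensed sketch of that logic, simplified from the patch below:

	level = this_cpu_read(exec_actions_level);
	if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
		/* shallow enough: reuse a preallocated per-CPU key and recurse now */
		struct sw_flow_key *recirc_key =
			&this_cpu_ptr(recirc_keys)->key[level - 1];

		*recirc_key = *key;
		recirc_key->recirc_id = nla_get_u32(a);
		ovs_dp_process_packet(skb, recirc_key);
	} else {
		/* too deep: queue on the (small) deferred-action FIFO as before */
		add_deferred_actions(skb, key, NULL);
	}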
Suggested-by: Pravin Shelar Signed-off-by: Lance Richardson Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4fe9032b1160..863e992dfbc0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -71,6 +71,8 @@ struct ovs_frag_data { static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); #define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) struct action_fifo { int head; int tail; @@ -78,7 +80,12 @@ struct action_fifo { struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; }; +struct recirc_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + static struct action_fifo __percpu *action_fifos; +static struct recirc_keys __percpu *recirc_keys; static DEFINE_PER_CPU(int, exec_actions_level); static void action_fifo_init(struct action_fifo *fifo) @@ -1020,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a, int rem) { struct deferred_action *da; + int level; if (!is_flow_key_valid(key)) { int err; @@ -1043,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, return 0; } + level = this_cpu_read(exec_actions_level); + if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { + struct recirc_keys *rks = this_cpu_ptr(recirc_keys); + struct sw_flow_key *recirc_key = &rks->key[level - 1]; + + *recirc_key = *key; + recirc_key->recirc_id = nla_get_u32(a); + ovs_dp_process_packet(skb, recirc_key); + + return 0; + } + da = add_deferred_actions(skb, key, NULL); if (da) { da->pkt_key.recirc_id = nla_get_u32(a); @@ -1209,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - static const int ovs_recursion_limit = 5; int err, level; level = __this_cpu_inc_return(exec_actions_level); - if (unlikely(level > ovs_recursion_limit)) { + if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); kfree_skb(skb); @@ -1238,10 +1257,17 @@ int action_fifos_init(void) if (!action_fifos) return -ENOMEM; + recirc_keys = alloc_percpu(struct recirc_keys); + if (!recirc_keys) { + free_percpu(action_fifos); + return -ENOMEM; + } + return 0; } void action_fifos_exit(void) { free_percpu(action_fifos); + free_percpu(recirc_keys); } -- cgit v1.2.3 From 76f0dcbb5ae1a7c3dbeec13dd98233b8e6b0b32a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Sep 2016 22:55:05 -0700 Subject: tcp: fix a stale ooo_last_skb after a replace When skb replaces another one in ooo queue, I forgot to also update tp->ooo_last_skb as well, if the replaced skb was the last one in the queue. To fix this, we simply can re-use the code that runs after an insertion, trying to merge skbs at the right of current skb. This not only fixes the bug, but also remove all small skbs that might be a subset of the new one. Example: We receive segments 2001:3001, 4001:5001 Then we receive 2001:8001 : We should replace 2001:3001 with the big skb, but also remove 4001:50001 from the queue to save space. packetdrill test demonstrating the bug 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) 
= 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +0.100 < . 1:1(0) ack 1 win 1024 +0 accept(3, ..., ...) = 4 +0.01 < . 1001:2001(1000) ack 1 win 1024 +0 > . 1:1(0) ack 1 +0.01 < . 1001:3001(2000) ack 1 win 1024 +0 > . 1:1(0) ack 1 Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Cc: Yaogong Wang Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 70b892db9901..dad3e7eeed94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4502,7 +4502,7 @@ coalesce_done: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); - goto add_sack; + goto merge_right; } } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; @@ -4514,6 +4514,7 @@ insert: rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); +merge_right: /* Remove other segments covered by skb. */ while ((q = rb_next(&skb->rbnode)) != NULL) { skb1 = rb_entry(q, struct sk_buff, rbnode); -- cgit v1.2.3 From d6f64d725bac20df66b2eacd847fc41d7a1905e0 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Thu, 15 Sep 2016 11:40:05 +1200 Subject: net: VRF: Pass original iif to ip_route_input() The function ip_rcv_finish() calls l3mdev_ip_rcv(). On any VRF except the global VRF, this replaces skb->dev with the VRF master interface. When calling ip_route_input_noref() from here, the checks for forwarding look at this master device instead of the initial ingress interface. This will allow packets to be routed which normally would be dropped. For example, an interface that is not assigned an IP address should drop packets, but because the checking is against the master device, the packet will be forwarded. The fix here is to still call l3mdev_ip_rcv(), but remember the initial net_device. This is passed to the other functions within ip_rcv_finish, so they still see the original interface. Signed-off-by: Mark Tomlinson Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4b351af3e67b..d6feabb03516 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + struct net_device *dev = skb->dev; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, dev); if (unlikely(err)) { if (err == -EXDEV) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); @@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * -- cgit v1.2.3 From cafdc45c949b9963cbfb8fe3a68d0ab16b0208ce Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 15 Sep 2016 16:26:40 +0200 Subject: net-next: dsa: add Qualcomm tag RX/TX handler Add support for the 2-bytes Qualcomm tag that gigabit switches such as the QCA8337/N might insert when receiving packets, or that we need to insert while targeting specific switch ports. The tag is inserted directly behind the ethernet header. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: John Crispin Signed-off-by: David S. 
Miller --- include/net/dsa.h | 1 + net/dsa/Kconfig | 3 ++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 3 ++ net/dsa/dsa_priv.h | 2 + net/dsa/tag_qca.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+) create mode 100644 net/dsa/tag_qca.c (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9d97c5214341..7556646db2d3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -26,6 +26,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_TRAILER, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_QCA, DSA_TAG_LAST, /* MUST BE LAST */ }; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index ff7736f7ff42..96e47c539bee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA config NET_DSA_TAG_TRAILER bool +config NET_DSA_TAG_QCA + bool + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 8af4ded70f1c..a3380ed0e0be 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d8d267e9a872..66e31acfcad8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -53,6 +53,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #endif #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 00077a9c97f4..6cfd7388834e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops; /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_qca.c */ +extern const struct dsa_device_ops qca_netdev_ops; #endif diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c new file mode 100644 index 000000000000..0c90cacee7aa --- /dev/null +++ b/net/dsa/tag_qca.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include "dsa_priv.h" + +#define QCA_HDR_LEN 2 +#define QCA_HDR_VERSION 0x2 + +#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_RECV_VERSION_S 14 +#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_RECV_PRIORITY_S 11 +#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6) +#define QCA_HDR_RECV_TYPE_S 6 +#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) +#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) + +#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14) +#define QCA_HDR_XMIT_VERSION_S 14 +#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11) +#define QCA_HDR_XMIT_PRIORITY_S 11 +#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8) +#define QCA_HDR_XMIT_CONTROL_S 8 +#define QCA_HDR_XMIT_FROM_CPU BIT(7) +#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0) + +static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u16 *phdr, hdr; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, 0) < 0) + goto out_free; + + skb_push(skb, QCA_HDR_LEN); + + memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); + phdr = (u16 *)(skb->data + 2 * ETH_ALEN); + + /* Set the version field, and set destination port information */ + hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | + QCA_HDR_XMIT_FROM_CPU | + BIT(p->port); + + *phdr = htons(hdr); + + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 ver; + int port; + __be16 *phdr, hdr; + + if (unlikely(!dst)) + goto out_drop; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) + goto out_drop; + + /* The QCA header is added by the switch between src addr and Ethertype + * At this point, skb->data points to ethertype so header should be + * right before + */ + phdr = (__be16 *)(skb->data - 2); + hdr = ntohs(*phdr); + + /* Make sure the version is correct */ + ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S; + if (unlikely(ver != QCA_HDR_VERSION)) + goto out_drop; + + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, + ETH_HLEN - QCA_HDR_LEN); + + /* This protocol doesn't support cascading multiple switches so it's + * safe to assume the switch is first in the tree + */ + ds = dst->ds[0]; + if (!ds) + goto out_drop; + + /* Get source port information */ + port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); + if (!ds->ports[port].netdev) + goto out_drop; + + /* Update skb & forward the frame accordingly */ + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[port].netdev; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops qca_netdev_ops = { + .xmit = qca_tag_xmit, + .rcv = qca_tag_rcv, +}; -- cgit v1.2.3 From c68df2e7be0c1238ea3c281fd744a204ef3b15a0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 15 Sep 2016 16:30:02 +0300 Subject: mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE In 46fa38e84b65 ("mac80211: allow software PS-Poll/U-APSD with AP_LINK_PS"), Johannes allowed to use mac80211's 
code for handling stations that go to PS or send PS-Poll / uAPSD trigger frames for devices that enable RSS. This means that mac80211 doesn't look at frames anymore but rather relies on a notification that will come from the device when a PS transition occurs or when a PS-Poll / trigger frame is detected by the device. iwlwifi will need this capability but still needs mac80211 to take care of the TIM IE. Today, if a driver sets AP_LINK_PS, mac80211 will not update the TIM IE. Change mac80211 to check existence of the set_tim driver callback rather than using AP_LINK_PS to decide if the driver handles the TIM IE internally or not. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho [reword commit message a bit] Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5e70fa52e1ff..1b1b28ff4fdb 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -690,7 +690,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) + if (!local->ops->set_tim) return; if (sta->dead) -- cgit v1.2.3 From fbd05e4a6e82fd573d3aa79e284e424b8d78c149 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Thu, 15 Sep 2016 18:15:09 +0300 Subject: cfg80211: add helper to find an IE that matches a byte-array There are a few places where an IE that matches not only the EID, but also other bytes inside the element, needs to be found. To simplify that and reduce the amount of similar code, implement a new helper function to match the EID and an extra array of bytes. Additionally, simplify cfg80211_find_vendor_ie() by using the new match function. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 33 +++++++++++++++++++++++++++- net/wireless/scan.c | 58 +++++++++++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d5e7f690bad9..533cb6410678 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3946,6 +3946,34 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map); +/** + * cfg80211_find_ie_match - match information element and byte array in data + * + * @eid: element ID + * @ies: data consisting of IEs + * @len: length of data + * @match: byte array to match + * @match_len: number of bytes in the match array + * @match_offset: offset in the IE where the byte array should match. + * If match_len is zero, this must also be set to zero. + * Otherwise this must be set to 2 or more, because the first + * byte is the element id, which is already compared to eid, and + * the second byte is the IE length. + * + * Return: %NULL if the element ID could not be found or if + * the element is invalid (claims to be longer than the given + * data) or if the byte array doesn't match, or a pointer to the first + * byte of the requested element, that is the byte containing the + * element ID. + * + * Note: There are no checks on the element length other than + * having to fit into the given data and being large enough for the + * byte array to match. 
+ */ +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset); + /** * cfg80211_find_ie - find information element in data * @@ -3961,7 +3989,10 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, * Note: There are no checks on the element length other than * having to fit into the given data. */ -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len); +static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +{ + return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0); +} /** * cfg80211_find_vendor_ie - find vendor specific information element in data diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0358e12be54b..b5bd58d0f731 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset) { - while (len > 2 && ies[0] != eid) { + /* match_offset can't be smaller than 2, unless match_len is + * zero, in which case match_offset must be zero as well. + */ + if (WARN_ON((match_len && match_offset < 2) || + (!match_len && match_offset))) + return NULL; + + while (len >= 2 && len >= ies[1] + 2) { + if ((ies[0] == eid) && + (ies[1] + 2 >= match_offset + match_len) && + !memcmp(ies + match_offset, match, match_len)) + return ies; + len -= ies[1] + 2; ies += ies[1] + 2; } - if (len < 2) - return NULL; - if (len < 2 + ies[1]) - return NULL; - return ies; + + return NULL; } -EXPORT_SYMBOL(cfg80211_find_ie); +EXPORT_SYMBOL(cfg80211_find_ie_match); const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, int len) { - struct ieee80211_vendor_ie *ie; - const u8 *pos = ies, *end = ies + len; - int ie_oui; + const u8 *ie; + u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; + int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; - while (pos < end) { - pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, - end - pos); - if (!pos) - return NULL; - - ie = (struct ieee80211_vendor_ie *)pos; - - /* make sure we can access ie->len */ - BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1); + ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, + match, match_len, 2); - if (ie->len < sizeof(*ie)) - goto cont; + if (ie && (ie[1] < 4)) + return NULL; - ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; - if (ie_oui == oui && - (oui_type < 0 || ie->oui_type == oui_type)) - return pos; -cont: - pos += 2 + ie->len; - } - return NULL; + return ie; } EXPORT_SYMBOL(cfg80211_find_vendor_ie); -- cgit v1.2.3 From d19127473a575c629c70974cee0bb8acb6374f08 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 07:26:01 +0100 Subject: rxrpc: Make IPv6 support conditional on CONFIG_IPV6 Add CONFIG_AF_RXRPC_IPV6 and make the IPv6 support code conditional on it. This is then made conditional on CONFIG_IPV6. Without this, the following can be seen: net/built-in.o: In function `rxrpc_init_peer': >> peer_object.c:(.text+0x18c3c8): undefined reference to `ip6_route_output_flags' Reported-by: kbuild test robot Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/Kconfig | 7 +++++++ net/rxrpc/af_rxrpc.c | 7 ++++++- net/rxrpc/conn_object.c | 2 ++ net/rxrpc/local_object.c | 2 ++ net/rxrpc/output.c | 2 ++ net/rxrpc/peer_event.c | 4 +++- net/rxrpc/peer_object.c | 10 ++++++++++ net/rxrpc/utils.c | 2 ++ 8 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 784c53163b7b..13396c74b5c1 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -19,6 +19,13 @@ config AF_RXRPC See Documentation/networking/rxrpc.txt. +config AF_RXRPC_IPV6 + bool "IPv6 support for RxRPC" + depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC) + help + Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as + its network transport. + config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f61f7b2d1ca4..09f81befc705 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -109,12 +109,14 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (srx->transport_len < sizeof(struct sockaddr_in6)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport) + sizeof(struct sockaddr_in6); break; +#endif default: return -EAFNOSUPPORT; @@ -413,9 +415,11 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) case AF_INET: rx->srx.transport_len = sizeof(struct sockaddr_in); break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: rx->srx.transport_len = sizeof(struct sockaddr_in6); break; +#endif default: ret = -EAFNOSUPPORT; goto error_unlock; @@ -570,7 +574,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; /* we support transport protocol UDP/UDP6 only */ - if (protocol != PF_INET && protocol != PF_INET6) + if (protocol != PF_INET && + IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6) return -EPROTONOSUPPORT; if (sock->type != SOCK_DGRAM) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c0ddba787fd4..bb1f29280aea 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -134,6 +134,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, srx.transport.sin.sin_addr.s_addr) goto not_found; break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != srx.transport.sin6.sin6_port || @@ -142,6 +143,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, sizeof(struct in6_addr)) != 0) goto not_found; break; +#endif default: BUG(); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index f5b9bb0d3f98..e3fad80b0795 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -58,6 +58,7 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: /* If the choice of UDP6 port is left up to the transport, then * the endpoint record doesn't match. 
@@ -67,6 +68,7 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, memcmp(&local->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); +#endif default: BUG(); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index d7cd87f17f0d..06a9aca739d1 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -259,6 +259,7 @@ send_fragmentable: } break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: opt = IPV6_PMTUDISC_DONT; ret = kernel_setsockopt(conn->params.local->socket, @@ -274,6 +275,7 @@ send_fragmentable: (char *)&opt, sizeof(opt)); } break; +#endif } up_write(&conn->params.local->defrag_sem); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 74217589cf44..9e0725f5652b 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -66,6 +66,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, } break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: srx.transport.sin6.sin6_port = serr->port; srx.transport_len = sizeof(struct sockaddr_in6); @@ -78,7 +79,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, break; case SO_EE_ORIGIN_ICMP: _net("Rx ICMP on v6 sock"); - memcpy(&srx.transport.sin6.sin6_addr.s6_addr + 12, + memcpy(srx.transport.sin6.sin6_addr.s6_addr + 12, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; @@ -89,6 +90,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, break; } break; +#endif default: BUG(); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index dfc07b41a472..f3e5766910fd 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -52,11 +52,13 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin6.sin6_addr); p = (u16 *)&srx->transport.sin6.sin6_addr; break; +#endif default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; @@ -100,12 +102,14 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: return ((u16 __force)peer->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&peer->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); +#endif default: BUG(); } @@ -159,7 +163,9 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) struct rtable *rt; struct flowi fl; struct flowi4 *fl4 = &fl.u.ip4; +#ifdef CONFIG_AF_RXRPC_IPV6 struct flowi6 *fl6 = &fl.u.ip6; +#endif peer->if_mtu = 1500; @@ -177,6 +183,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) dst = &rt->dst; break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: fl6->flowi6_iif = LOOPBACK_IFINDEX; fl6->flowi6_scope = RT_SCOPE_UNIVERSE; @@ -191,6 +198,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) return; } break; +#endif default: BUG(); @@ -241,9 +249,11 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) case AF_INET: peer->hdrsize = sizeof(struct iphdr); break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: peer->hdrsize = sizeof(struct ipv6hdr); break; +#endif default: BUG(); } diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index b88914d53ca5..ff7af71c4b49 
100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -30,6 +30,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; return 0; +#ifdef CONFIG_AF_RXRPC_IPV6 case ETH_P_IPV6: srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin6); @@ -37,6 +38,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) srx->transport.sin6.sin6_port = udp_hdr(skb)->source; srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr; return 0; +#endif default: pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n", -- cgit v1.2.3 From fabf9201806255d70386d8bc9f6a2942c0940da2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Remove some whitespace. Remove a tab that's on a line that should otherwise be blank. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 61432049869b..9367c3be31eb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -31,7 +31,7 @@ static void rxrpc_set_timer(struct rxrpc_call *call) _enter("{%ld,%ld,%ld:%ld}", call->ack_at - now, call->resend_at - now, call->expire_at - now, call->timer.expires - now); - + read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { -- cgit v1.2.3 From 4b22457c06a3a950e14938c486283ad0f308c13d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Move the check of rx_pkt_offset from rxrpc_locate_data() to caller Move the check of rx_pkt_offset from rxrpc_locate_data() to the caller, rxrpc_recvmsg_data(), so that it's more clear what's going on there. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a284205b8ecf..0d085f5cf1bf 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -240,9 +240,6 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, int ret; u8 annotation = *_annotation; - if (offset > 0) - return 0; - /* Locate the subpacket */ offset = sp->offset; len = skb->len - sp->offset; @@ -303,8 +300,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); + if (rx_pkt_offset == 0) + ret = rxrpc_locate_data(call, skb, + &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); -- cgit v1.2.3 From 2e2ea51dec2ab6a81950d4b436eb66ebf45dd507 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Check the return value of rxrpc_locate_data() Check the return value of rxrpc_locate_data() in rxrpc_recvmsg_data(). 
Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0d085f5cf1bf..1edf2cf62cc5 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -300,10 +300,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - if (rx_pkt_offset == 0) + if (rx_pkt_offset == 0) { ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + if (ret < 0) + goto out; + } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); -- cgit v1.2.3 From 816c9fce12f3745abc959c0fca8ace1c2c51421c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Fix handling of the last packet in rxrpc_recvmsg_data() The code for determining the last packet in rxrpc_recvmsg_data() has been using the RXRPC_CALL_RX_LAST flag to determine if the rx_top pointer points to the last packet or not. This isn't a good idea, however, as the input code may be running simultaneously on another CPU and that sets the flag *before* updating the top pointer. Fix this by the following means: (1) Restrict the use of RXRPC_CALL_RX_LAST to the input routines only. There's otherwise a synchronisation problem between detecting the flag and checking tx_top. This could probably be dealt with by appropriate application of memory barriers, but there's a simpler way. (2) Set RXRPC_CALL_RX_LAST after setting rx_top. (3) Make rxrpc_rotate_rx_window() consult the flags header field of the DATA packet it's about to discard to see if that was the last packet. Use this as the basis for ending the Rx phase. This shouldn't be a problem because the recvmsg side of things is guaranteed to see the packets in order. (4) Make rxrpc_recvmsg_data() return 1 to indicate the end of the data if: (a) the packet it has just processed is marked as RXRPC_LAST_PACKET (b) the call's Rx phase has been ended. 
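Condensed, the ordering that makes this safe looks roughly like the following (names as in the patch below; shown only to illustrate why recvmsg can trust the per-packet flag of the skb it is discarding rather than RXRPC_CALL_RX_LAST):

	/* input side (rxrpc_input_data) */
	call->rxtx_buffer[ix] = skb;
	if (after(seq, call->rx_top))
		smp_store_release(&call->rx_top, seq);	/* publish the packet first */
	if (flags & RXRPC_LAST_PACKET)
		set_bit(RXRPC_CALL_RX_LAST, &call->flags);	/* then mark "last seen" */

	/* recvmsg side (rxrpc_rotate_rx_window): consult the packet itself */
	sp = rxrpc_skb(skb);
	if (sp->hdr.flags & RXRPC_LAST_PACKET)
		rxrpc_end_rx_phase(call);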
Signed-off-by: David Howells --- net/rxrpc/input.c | 4 +++- net/rxrpc/recvmsg.c | 49 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 75af0bd316c7..f0d9115b9b7e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -238,7 +238,7 @@ next_subpacket: len = RXRPC_JUMBO_DATALEN; if (flags & RXRPC_LAST_PACKET) { - if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && seq != call->rx_top) return rxrpc_proto_abort("LSN", call, seq); } else { @@ -282,6 +282,8 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); + if (flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RX_LAST, &call->flags); queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 1edf2cf62cc5..8b8d7e14f800 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); @@ -163,8 +165,10 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) */ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { + struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t hard_ack, top; + u8 flags; int ix; _enter("%d", call->debug_id); @@ -177,6 +181,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); + flags = sp->hdr.flags; call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -184,8 +190,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); - _debug("%u,%u,%lx", hard_ack, top, call->flags); - if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + _debug("%u,%u,%02x", hard_ack, top, flags); + if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -278,13 +284,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t remain; bool last; unsigned int rx_pkt_offset, rx_pkt_len; - int ix, copy, ret = 0; + int ix, copy, ret = -EAGAIN, ret2; _enter(""); rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; + if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { + seq = call->rx_hard_ack; + ret = 1; + goto done; + } + /* Barriers against rxrpc_input_data(). 
*/ hard_ack = call->rx_hard_ack; top = smp_load_acquire(&call->rx_top); @@ -301,11 +313,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, sock_recv_timestamp(msg, sock->sk, skb); if (rx_pkt_offset == 0) { - ret = rxrpc_locate_data(call, skb, - &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); - if (ret < 0) + ret2 = rxrpc_locate_data(call, skb, + &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + if (ret2 < 0) { + ret = ret2; goto out; + } } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -316,10 +330,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (copy > remain) copy = remain; if (copy > 0) { - ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, - copy); - if (ret < 0) + ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret2 < 0) { + ret = ret2; goto out; + } /* handle piecemeal consumption of data packets */ _debug("copied %d @%zu", copy, *_offset); @@ -332,6 +348,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_len > 0) { _debug("buffer full"); ASSERTCMP(*_offset, ==, len); + ret = 0; break; } @@ -342,19 +359,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = 0; rx_pkt_len = 0; - ASSERTIFCMP(last, seq, ==, top); - } - - if (after(seq, top)) { - ret = -EAGAIN; - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + if (last) { + ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); ret = 1; + goto out; + } } + out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } +done: _leave(" = %d [%u/%u]", ret, seq, top); return ret; } -- cgit v1.2.3 From e6f3afb3fc058e17b407b6f7cac08058b19e641c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Record calls that need to be accepted Record calls that need to be accepted using sk_acceptq_added() otherwise the backlog counter goes negative because sk_acceptq_removed() is called. This causes the preallocator to malfunction. Calls that are preaccepted by AFS within the kernel aren't affected by this. Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 26c293ef98eb..323b8da50163 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -369,6 +369,8 @@ found_service: if (rx->notify_new_call) rx->notify_new_call(&rx->sk, call, call->user_call_ID); + else + sk_acceptq_added(&rx->sk); spin_lock(&conn->state_lock); switch (conn->state) { -- cgit v1.2.3 From 0360da6db7d6390e7bd2f6c93b01af29bcd36ad5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Purge the to_be_accepted queue on socket release Purge the queue of to_be_accepted calls on socket release. Note that purging sock_calls doesn't release the ref owned by to_be_accepted. Probably the sock_calls list is redundant given a purges of the recvmsg_q, the to_be_accepted queue and the calls tree. 
Signed-off-by: David Howells --- net/rxrpc/call_object.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 22f9b0d1a138..b0ffbd9664e6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -476,6 +476,16 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) _enter("%p", rx); + while (!list_empty(&rx->to_be_accepted)) { + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); + list_del(&call->accept_link); + rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); -- cgit v1.2.3 From 66d58af7f4af53e8318e852efa31a7cb0e31bfb6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the putting of client connections In rxrpc_put_one_client_conn(), if a connection has RXRPC_CONN_COUNTED set on it, then it's accounted for in rxrpc_nr_client_conns and may be on various lists - and this is cleaned up correctly. However, if the connection doesn't have RXRPC_CONN_COUNTED set on it, then the put routine returns rather than just skipping the extra bit of cleanup. Fix this by making the extra bit of clean up conditional instead and always killing off the connection. This manifests itself as connections with a zero usage count hanging around in /proc/net/rxrpc_conns because the connection allocated, but discarded, due to a race with another process that set up a parallel connection, which was then shared instead. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9344a8416ceb..5a675c43cace 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -818,7 +818,7 @@ idle_connection: static struct rxrpc_connection * rxrpc_put_one_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + struct rxrpc_connection *next = NULL; struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; @@ -834,24 +834,22 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); - if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags)) - return NULL; - - spin_lock(&rxrpc_client_conn_cache_lock); - nr_conns = --rxrpc_nr_client_conns; + if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + spin_lock(&rxrpc_client_conn_cache_lock); + nr_conns = --rxrpc_nr_client_conns; + + if (nr_conns < rxrpc_max_client_connections && + !list_empty(&rxrpc_waiting_client_conns)) { + next = list_entry(rxrpc_waiting_client_conns.next, + struct rxrpc_connection, cache_link); + rxrpc_get_connection(next); + rxrpc_activate_conn(next); + } - next = NULL; - if (nr_conns < rxrpc_max_client_connections && - !list_empty(&rxrpc_waiting_client_conns)) { - next = list_entry(rxrpc_waiting_client_conns.next, - struct rxrpc_connection, cache_link); - rxrpc_get_connection(next); - rxrpc_activate_conn(next); + spin_unlock(&rxrpc_client_conn_cache_lock); } - spin_unlock(&rxrpc_client_conn_cache_lock); rxrpc_kill_connection(conn); - if (next) rxrpc_activate_channels(next); -- cgit v1.2.3 From 357f5ef64628c2d6c532e7a6bfc0bc3830b4c221 Mon Sep 17 00:00:00 2001 From: David Howells 
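The pattern being added is the usual drain-and-put loop: each call parked on to_be_accepted still owns a reference of its own, so releasing the socket has to unlink every entry, abort it, and drop that reference. A minimal sketch of the pattern (simplified; the actual change follows):

	while (!list_empty(&rx->to_be_accepted)) {
		call = list_entry(rx->to_be_accepted.next,
				  struct rxrpc_call, accept_link);
		list_del(&call->accept_link);
		/* tell the peer, then drop the queue's reference */
		rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
		rxrpc_put_call(call, rxrpc_call_put);
	}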
Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Call rxrpc_release_call() on error in rxrpc_new_client_call() Call rxrpc_release_call() on getting an error in rxrpc_new_client_call() rather than trying to do the cleanup ourselves. This isn't a problem, provided we set RXRPC_CALL_HAS_USERID only if we actually add the call to the calls tree as cleanup code fragments that would otherwise cause problems are conditional. Without this, we miss some of the cleanup. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b0ffbd9664e6..23f5a5f58282 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -226,9 +226,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ - call->user_call_ID = user_call_ID; - __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); - write_lock(&rx->call_lock); pp = &rx->calls.rb_node; @@ -242,10 +239,12 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else - goto found_user_ID_now_present; + goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); @@ -276,33 +275,22 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _leave(" = %p [new]", call); return call; -error: - write_lock(&rx->call_lock); - rb_erase(&call->sock_node, &rx->calls); - write_unlock(&rx->call_lock); - rxrpc_put_call(call, rxrpc_call_put_userid); - - write_lock(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock(&rxrpc_call_lock); - -error_out: - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, ret); - set_bit(RXRPC_CALL_RELEASED, &call->flags); - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = %d", ret); - return ERR_PTR(ret); - /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ -found_user_ID_now_present: +error_dup_user_ID: write_unlock(&rx->call_lock); ret = -EEXIST; - goto error_out; + +error: + __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + _leave(" = %d", ret); + return ERR_PTR(ret); } /* -- cgit v1.2.3 From 78883793f8ac4bb3f97d48db7a8c71d8476bcf98 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix unexposed client conn release If the last call on a client connection is release after the connection has had a bunch of calls allocated but before any DATA packets are sent (so that it's not yet marked RXRPC_CONN_EXPOSED), an assertion will happen in rxrpc_disconnect_client_call(). af_rxrpc: Assertion failed - 1(0x1) >= 2(0x2) is false ------------[ cut here ]------------ kernel BUG at ../net/rxrpc/conn_client.c:753! This is because it's expecting the conn to have been exposed and to have 2 or more refs - but this isn't necessarily the case. Simply remove the assertion. 
This allows the conn to be moved into the inactive state and deleted if it isn't resurrected before the final put is called. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5a675c43cace..226bc910e556 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -721,7 +721,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) } ASSERTCMP(rcu_access_pointer(chan->call), ==, call); - ASSERTCMP(atomic_read(&conn->usage), >=, 2); /* If a client call was exposed to the world, we save the result for * retransmission. -- cgit v1.2.3 From d01dc4c3c1209e865368d5f8d3b5e08f97326ca9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the parsing of soft-ACKs The soft-ACK parser doesn't increment the pointer into the soft-ACK list, resulting in the first ACK/NACK value being applied to all the relevant packets in the Tx queue. This has the potential to miss retransmissions and cause excessive retransmissions. Fix this by incrementing the pointer. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index f0d9115b9b7e..c1f83d22f9b7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -384,7 +384,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, for (; nr_acks > 0; nr_acks--, seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; - switch (*acks) { + switch (*acks++) { case RXRPC_ACK_TYPE_ACK: call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; break; -- cgit v1.2.3 From dfa7d9204054b091949d87270e55e0fd5800c3ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix retransmission algorithm Make the retransmission algorithm use for-loops instead of do-loops and move the counter increments into the for-statement increment slots. Though the do-loops are slightly more efficient since there will be at least one pass through each loop, the counter increments are harder to get right as the continue-statements skip them. Without this, if there are any positive acks within the loop, the do-loop will cycle forever because the counter increment is never done. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 9367c3be31eb..f0cabc48a1b7 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -163,8 +163,7 @@ static void rxrpc_resend(struct rxrpc_call *call) */ now = jiffies; resend_at = now + rxrpc_resend_timeout; - seq = cursor + 1; - do { + for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; if (annotation == RXRPC_TX_ANNO_ACK) @@ -184,8 +183,7 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; - seq++; - } while (before_eq(seq, top)); + } call->resend_at = resend_at; @@ -194,8 +192,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * lock is dropped, it may clear some of the retransmission markers for * packets that it soft-ACKs.
*/ - seq = cursor + 1; - do { + for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; if (annotation != RXRPC_TX_ANNO_RETRANS) @@ -237,8 +234,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; - seq++; - } while (before_eq(seq, top)); + } out_unlock: spin_unlock_bh(&call->lock); -- cgit v1.2.3 From 27d0fc431c6b4847231c1490fa541bc3f5a7a351 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Don't transmit an ACK if there's no reason set Don't transmit an ACK if call->ackr_reason is unset. There's the possibility of a race between recvmsg() sending an ACK and the background processing thread trying to send the same one. Signed-off-by: David Howells --- net/rxrpc/output.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 06a9aca739d1..aa0507214b31 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -137,6 +137,11 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) switch (type) { case RXRPC_PACKET_TYPE_ACK: spin_lock_bh(&call->lock); + if (!call->ackr_reason) { + spin_unlock_bh(&call->lock); + ret = 0; + goto out; + } n = rxrpc_fill_out_ack(call, pkt); call->ackr_reason = 0; -- cgit v1.2.3 From 2311e327cda015a24a201efc7655a9a983679e55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Be consistent about switch value in rxrpc_send_call_packet() rxrpc_send_call_packet() should use type in both its switch-statements rather than using pkt->whdr.type. This might give the compiler an easier job of uninitialised variable checking. Signed-off-by: David Howells --- net/rxrpc/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index aa0507214b31..0b21ed859de7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -182,7 +182,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) &msg, iov, ioc, len); if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { - switch (pkt->whdr.type) { + switch (type) { case RXRPC_PACKET_TYPE_ACK: rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), -- cgit v1.2.3 From 182f50562490e5861afaa7a2e42dcc0dd9dcfcca Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the basic transmit DATA packet content size at 1412 bytes Fix the basic transmit DATA packet content size at 1412 bytes so that they can be arbitrarily assembled into jumbo packets. In the future, I'm thinking of moving to keeping a jumbo packet header at the beginning of each packet in the Tx queue and creating the packet header on the spot when kernel_sendmsg() is invoked. That way, jumbo packets can be assembled on the spur of the moment for (re-)transmission.
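The sizing arithmetic behind this fix is simple enough to show in isolation: start from the fixed per-fragment budget, subtract the security trailer and round down to the connection's alignment. A minimal stand-alone sketch follows; the 1412-byte constant mirrors RXRPC_JUMBO_DATALEN, while the security size and alignment passed in are made-up example values rather than anything taken from a real connection.

#include <stdio.h>

#define JUMBO_DATALEN 1412U    /* fixed per-fragment payload budget */

/* Illustrative only: usable payload after security overhead and alignment. */
static unsigned int max_payload(unsigned int security_size, unsigned int size_align)
{
    unsigned int max = JUMBO_DATALEN;

    max -= security_size;           /* room for the security trailer */
    max &= ~(size_align - 1U);      /* round down to the cipher's block size */
    return max;
}

int main(void)
{
    /* Example values for a hypothetical security class. */
    printf("usable payload: %u bytes\n", max_payload(8, 8));
    return 0;
}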
Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index cba236575073..8bfddf4e338c 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -214,7 +214,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto maybe_error; } - max = call->conn->params.peer->maxdata; + max = RXRPC_JUMBO_DATALEN; max -= call->conn->security_size; max &= ~(call->conn->size_align - 1UL); -- cgit v1.2.3 From a3868bfc8d5b0f36c784deab644ee1d2b0e6974b Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Print the packet type name in the Rx packet trace Print a symbolic packet type name for each valid received packet in the trace output, not just a number. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 5 +++-- net/rxrpc/ar-internal.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ea3b10ed91a8..0a30c673509c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -93,11 +93,12 @@ TRACE_EVENT(rxrpc_rx_packet, memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), - TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x", + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, - __entry->hdr.type, __entry->hdr.flags) + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") ); TRACE_EVENT(rxrpc_rx_done, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e78c40b37db5..0f6fafa2c271 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -551,6 +551,9 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +extern const char *const rxrpc_pkts[]; +extern const char *rxrpc_acks(u8 reason); + #include /* @@ -851,11 +854,8 @@ extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; extern unsigned int rxrpc_resend_timeout; -extern const char *const rxrpc_pkts[]; extern const s8 rxrpc_ack_priority[]; -extern const char *rxrpc_acks(u8 reason); - /* * output.c */ -- cgit v1.2.3 From a84a46d73050f70fd8820c74840e2815c78d8690 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add some additional call tracing Add additional call tracepoint points for noting call-connected, call-released and connection-failed events. Also fix one tracepoint that was using an integer instead of the corresponding enum value as the point type. 
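The call-tracing additions follow the pattern used throughout this series: an enum of trace points paired with a table of short names indexed by the enum, so every trace line carries a compact tag. A minimal user-space sketch of that pattern; the event names and tags below are invented for illustration, not the kernel's actual rxrpc_call_trace entries.

#include <stdio.h>

enum call_trace {
    call_new,
    call_connected,
    call_release,
    call_error,
    call__nr_trace          /* keeps the table and the enum in step */
};

static const char call_traces[call__nr_trace][4] = {
    [call_new]       = "NEW",
    [call_connected] = "CON",
    [call_release]   = "RLS",
    [call_error]     = "*E*",
};

static void trace_call(enum call_trace op, int usage)
{
    printf("%s u=%d\n", call_traces[op], usage);
}

int main(void)
{
    trace_call(call_connected, 2);
    trace_call(call_release, 1);
    return 0;
}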
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/call_object.c | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0f6fafa2c271..4a73c20d9436 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -539,6 +539,8 @@ enum rxrpc_call_trace { rxrpc_call_queued, rxrpc_call_queued_ref, rxrpc_call_seen, + rxrpc_call_connected, + rxrpc_call_release, rxrpc_call_got, rxrpc_call_got_userid, rxrpc_call_got_kernel, @@ -546,6 +548,7 @@ enum rxrpc_call_trace { rxrpc_call_put_userid, rxrpc_call_put_kernel, rxrpc_call_put_noqueue, + rxrpc_call_error, rxrpc_call__nr_trace }; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 23f5a5f58282..0df9d1af8edb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,6 +53,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_new_service] = "NWs", [rxrpc_call_queued] = "QUE", [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_connected] = "CON", + [rxrpc_call_release] = "RLS", [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", [rxrpc_call_got_userid] = "Gus", @@ -61,6 +63,7 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_put_userid] = "Pus", [rxrpc_call_put_kernel] = "Pke", [rxrpc_call_put_noqueue] = "PNQ", + [rxrpc_call_error] = "*E*", }; struct kmem_cache *rxrpc_call_jar; @@ -222,8 +225,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, 0, atomic_read(&call->usage), here, - (const void *)user_call_ID); + trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + here, (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); @@ -263,6 +266,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; + trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), + here, ERR_PTR(ret)); + spin_lock_bh(&call->conn->params.peer->lock); hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); @@ -287,6 +293,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); + trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), + here, ERR_PTR(ret)); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); @@ -396,15 +404,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { + const void *here = __builtin_return_address(0); struct rxrpc_connection *conn = call->conn; bool put = false; int i; _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + here, (const void *)call->flags); - rxrpc_see_call(call); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); spin_lock_bh(&call->lock); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) -- cgit v1.2.3 From 363deeab6d0f308d33d011323661ae9cf5f9f8d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add connection tracepoint and client conn state tracepoint Add a pair of tracepoints, one to track rxrpc_connection struct ref counting and the other to track the client connection cache state. 
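The point of the connection ref-count tracepoint is that every get/put records the new usage count together with the caller's address, so a leaked reference can be attributed to a call site. A rough user-space sketch of that shape, with a plain int instead of an atomic counter, printf in place of a real tracepoint, and invented struct and function names.

#include <stdio.h>

struct conn {
    int usage;              /* stand-in for the atomic usage count */
};

/* Illustrative only: log the operation, the new count and the call site. */
static void trace_conn(struct conn *c, const char *op, int n, const void *where)
{
    printf("C=%p %s u=%d sp=%p\n", (void *)c, op, n, (void *)where);
}

static void conn_get(struct conn *c)
{
    trace_conn(c, "GOT", ++c->usage, __builtin_return_address(0));
}

static void conn_put(struct conn *c)
{
    trace_conn(c, "PUT", --c->usage, __builtin_return_address(0));
}

int main(void)
{
    struct conn c = { .usage = 1 };

    conn_get(&c);
    conn_put(&c);
    return 0;
}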
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 60 ++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 76 +++++++++++++++++++++++++--------------- net/rxrpc/call_accept.c | 4 +++ net/rxrpc/call_object.c | 2 -- net/rxrpc/conn_client.c | 82 ++++++++++++++++++++++++++++++-------------- net/rxrpc/conn_event.c | 2 +- net/rxrpc/conn_object.c | 72 ++++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/misc.c | 31 +++++++++++++++++ 9 files changed, 274 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0a30c673509c..c0c496c83f31 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,66 @@ #include +TRACE_EVENT(rxrpc_conn, + TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, + int usage, const void *where), + + TP_ARGS(conn, op, usage, where), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->op = op; + __entry->usage = usage; + __entry->where = where; + ), + + TP_printk("C=%p %s u=%d sp=%pSR", + __entry->conn, + rxrpc_conn_traces[__entry->op], + __entry->usage, + __entry->where) + ); + +TRACE_EVENT(rxrpc_client, + TP_PROTO(struct rxrpc_connection *conn, int channel, + enum rxrpc_client_trace op), + + TP_ARGS(conn, channel, op), + + TP_STRUCT__entry( + __field(struct rxrpc_connection *, conn ) + __field(u32, cid ) + __field(int, channel ) + __field(int, usage ) + __field(enum rxrpc_client_trace, op ) + __field(enum rxrpc_conn_cache_state, cs ) + ), + + TP_fast_assign( + __entry->conn = conn; + __entry->channel = channel; + __entry->usage = atomic_read(&conn->usage); + __entry->op = op; + __entry->cid = conn->proto.cid; + __entry->cs = conn->cache_state; + ), + + TP_printk("C=%p h=%2d %s %s i=%08x u=%d", + __entry->conn, + __entry->channel, + rxrpc_client_traces[__entry->op], + rxrpc_conn_cache_states[__entry->cs], + __entry->cid, + __entry->usage) + ); + TRACE_EVENT(rxrpc_call, TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, int usage, const void *where, const void *aux), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4a73c20d9436..6ca40eea3022 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -314,6 +314,7 @@ enum rxrpc_conn_cache_state { RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ + RXRPC_CONN__NR_CACHE_STATES }; /* @@ -533,6 +534,44 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_conn_trace { + rxrpc_conn_new_client, + rxrpc_conn_new_service, + rxrpc_conn_queued, + rxrpc_conn_seen, + rxrpc_conn_got, + rxrpc_conn_put_client, + rxrpc_conn_put_service, + rxrpc_conn__nr_trace +}; + +extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; + +enum rxrpc_client_trace { + rxrpc_client_activate_chans, + rxrpc_client_alloc, + rxrpc_client_chan_activate, + rxrpc_client_chan_disconnect, + rxrpc_client_chan_pass, + rxrpc_client_chan_unstarted, + rxrpc_client_cleanup, + rxrpc_client_count, + rxrpc_client_discard, + rxrpc_client_duplicate, + rxrpc_client_exposed, + rxrpc_client_replace, + rxrpc_client_to_active, + rxrpc_client_to_culled, + rxrpc_client_to_idle, + rxrpc_client_to_inactive, + 
rxrpc_client_to_waiting, + rxrpc_client_uncount, + rxrpc_client__nr_trace +}; + +extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; +extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; + enum rxrpc_call_trace { rxrpc_call_new_client, rxrpc_call_new_service, @@ -734,7 +773,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); -void __rxrpc_put_connection(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *); +void rxrpc_see_connection(struct rxrpc_connection *); +void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); +void rxrpc_put_service_conn(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -747,38 +790,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_get_connection(struct rxrpc_connection *conn) -{ - atomic_inc(&conn->usage); -} - -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) -{ - return atomic_inc_not_zero(&conn->usage) ? conn : NULL; -} - static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { if (!conn) return; - if (rxrpc_conn_is_client(conn)) { - if (atomic_dec_and_test(&conn->usage)) - rxrpc_put_client_conn(conn); - } else { - if (atomic_dec_return(&conn->usage) == 1) - __rxrpc_put_connection(conn); - } -} - -static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) -{ - if (!rxrpc_get_connection_maybe(conn)) - return false; - if (!rxrpc_queue_work(&conn->processor)) - rxrpc_put_connection(conn); - return true; + if (rxrpc_conn_is_client(conn)) + rxrpc_put_client_conn(conn); + else + rxrpc_put_service_conn(conn); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 323b8da50163..3e474508ba75 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -85,6 +85,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), here); } /* Now it gets complicated, because calls get registered with the @@ -290,6 +293,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_get_local(local); conn->params.local = local; conn->params.peer = peer; + rxrpc_see_connection(conn); rxrpc_new_incoming_connection(conn, skb); } else { rxrpc_get_connection(conn); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0df9d1af8edb..54f30482a7fd 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -479,8 +479,6 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 226bc910e556..c76a125df891 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,6 +105,14 @@ static void 
rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); +const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { + [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", + [RXRPC_CONN_CLIENT_WAITING] = "Wait", + [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", + [RXRPC_CONN_CLIENT_CULLED] = "Cull", + [RXRPC_CONN_CLIENT_IDLE] = "Idle", +}; + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); + trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + __builtin_return_address(0)); + trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -385,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, rb_replace_node(&conn->client_node, &candidate->client_node, &local->client_conns); + trace_rxrpc_client(conn, -1, rxrpc_client_replace); goto candidate_published; } } @@ -409,8 +421,11 @@ found_extant_conn: _debug("found conn"); spin_unlock(&local->client_conns_lock); - rxrpc_put_connection(candidate); - candidate = NULL; + if (candidate) { + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + candidate = NULL; + } spin_lock(&conn->channel_lock); call->conn = conn; @@ -433,6 +448,7 @@ error: */ static void rxrpc_activate_conn(struct rxrpc_connection *conn) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; rxrpc_nr_active_client_conns++; list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); @@ -462,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = rxrpc_nr_client_conns; - if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_count); rxrpc_nr_client_conns = nr_conns + 1; + } switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: @@ -494,6 +512,7 @@ activate_conn: wait_for_capacity: _debug("wait"); + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); goto out_unlock; @@ -524,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; write_unlock_bh(&call->state_lock); @@ -563,6 +584,8 @@ static void rxrpc_activate_channels(struct rxrpc_connection *conn) _enter("%d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) return; @@ -657,10 +680,13 @@ int rxrpc_connect_call(struct rxrpc_call *call, * had a chance at re-use (the per-connection security negotiation is * expensive). 
*/ -static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, + unsigned int channel) { - if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); rxrpc_get_connection(conn); + } } /* @@ -669,9 +695,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) */ void rxrpc_expose_client_call(struct rxrpc_call *call) { + unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = - &conn->channels[call->cid & RXRPC_CHANNELMASK]; + struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter @@ -682,7 +708,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_expose_client_conn(conn); + rxrpc_expose_client_conn(conn, channel); } } @@ -695,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; spin_lock(&conn->channel_lock); @@ -709,6 +736,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->chan_wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); + /* We must deactivate or idle the connection if it's now * waiting for nothing. */ @@ -739,7 +768,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) /* See if we can pass the channel directly to another call. */ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && !list_empty(&conn->waiting_calls)) { - _debug("pass chan"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out_2; } @@ -762,7 +791,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out; } - _debug("pass chan 2"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out; @@ -794,7 +823,7 @@ idle_connection: * immediately or moved to the idle list for a short while. 
*/ if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - _debug("make idle"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; conn->cache_state = RXRPC_CONN_CLIENT_IDLE; list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); @@ -804,7 +833,7 @@ idle_connection: &rxrpc_client_conn_reap, rxrpc_conn_idle_client_expiry); } else { - _debug("make inactive"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); } @@ -821,6 +850,8 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; + trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { spin_lock(&local->client_conns_lock); if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, @@ -834,6 +865,7 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_uncount); spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = --rxrpc_nr_client_conns; @@ -863,20 +895,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) */ void rxrpc_put_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + const void *here = __builtin_return_address(0); + int n; do { - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); - - next = rxrpc_put_one_client_conn(conn); - - if (!next) - break; - conn = next; - } while (atomic_dec_and_test(&conn->usage)); - - _leave(""); + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + if (n > 0) + return; + ASSERTCMP(n, >=, 0); + + conn = rxrpc_put_one_client_conn(conn); + } while (conn); } /* @@ -907,9 +937,11 @@ static void rxrpc_cull_active_client_conns(void) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); if (list_empty(&conn->waiting_calls)) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); conn->cache_state = RXRPC_CONN_CLIENT_CULLED; list_del_init(&conn->cache_link); } else { + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); @@ -983,7 +1015,7 @@ next: goto not_yet_expired; } - _debug("discard conn %d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) BUG(); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 0691007cfc02..a43f4c94a88d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -377,7 +377,7 @@ void rxrpc_process_connection(struct work_struct *work) u32 abort_code = RX_PROTOCOL_ERROR; int ret; - _enter("{%d}", conn->debug_id); + rxrpc_see_connection(conn); if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bb1f29280aea..3b55aee0c436 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -246,11 +246,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) } /* - * release a virtual connection + * Queue a connection's work processor, getting a ref to pass to the work + * queue. 
*/ -void __rxrpc_put_connection(struct rxrpc_connection *conn) +bool rxrpc_queue_conn(struct rxrpc_connection *conn) { - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n == 0) + return false; + if (rxrpc_queue_work(&conn->processor)) + trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + else + rxrpc_put_connection(conn); + return true; +} + +/* + * Note the re-emergence of a connection. + */ +void rxrpc_see_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + if (conn) { + int n = atomic_read(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + } +} + +/* + * Get a ref on a connection. + */ +void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); +} + +/* + * Try to get a ref on a connection. + */ +struct rxrpc_connection * +rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + + if (conn) { + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n > 0) + trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + else + conn = NULL; + } + return conn; +} + +/* + * Release a service connection + */ +void rxrpc_put_service_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + ASSERTCMP(n, >=, 0); + if (n == 0) + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 83d54da4ce8b..eef551f40dc2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -136,6 +136,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) list_add_tail(&conn->link, &rxrpc_connections); list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), + __builtin_return_address(0)); } return conn; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 8b910780f1ac..598064d3bdd2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -101,3 +101,34 @@ const char *rxrpc_acks(u8 reason) reason = ARRAY_SIZE(str) - 1; return str[reason]; } + +const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { + [rxrpc_conn_new_client] = "NWc", + [rxrpc_conn_new_service] = "NWs", + [rxrpc_conn_queued] = "QUE", + [rxrpc_conn_seen] = "SEE", + [rxrpc_conn_got] = "GOT", + [rxrpc_conn_put_client] = "PTc", + [rxrpc_conn_put_service] = "PTs", +}; + +const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { + [rxrpc_client_activate_chans] = "Activa", + [rxrpc_client_alloc] = "Alloc ", + [rxrpc_client_chan_activate] = "ChActv", + [rxrpc_client_chan_disconnect] = "ChDisc", + [rxrpc_client_chan_pass] = "ChPass", + [rxrpc_client_chan_unstarted] = "ChUnst", + [rxrpc_client_cleanup] = "Clean ", + [rxrpc_client_count] = "Count ", + [rxrpc_client_discard] = "Discar", + [rxrpc_client_duplicate] = "Duplic", + [rxrpc_client_exposed] = "Expose", + [rxrpc_client_replace] = "Replac", + [rxrpc_client_to_active] = "->Actv", + [rxrpc_client_to_culled] = "->Cull", + [rxrpc_client_to_idle] = "->Idle", + [rxrpc_client_to_inactive] = "->Inac", + [rxrpc_client_to_waiting] = "->Wait", + [rxrpc_client_uncount] = 
"Uncoun", +}; -- cgit v1.2.3 From a124fe3ee5d82f2c9a9b8818ed5cb9f61685f1d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow the life of a packet in the Tx buffer Add a tracepoint to follow the insertion of a packet into the transmit buffer, its transmission and its rotation out of the buffer. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/input.c | 2 ++ net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/sendmsg.c | 9 ++++++++- 5 files changed, 57 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c0c496c83f31..ffc74b3e5b76 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -208,6 +208,32 @@ TRACE_EVENT(rxrpc_abort, __entry->abort_code, __entry->error, __entry->why) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), + + TP_ARGS(call, why), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_transmit_trace, why ) + __field(rxrpc_seq_t, tx_hard_ack ) + __field(rxrpc_seq_t, tx_top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->tx_hard_ack = call->tx_hard_ack; + __entry->tx_top = call->tx_top; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_transmit_traces[__entry->why], + __entry->tx_hard_ack + 1, + __entry->tx_top - __entry->tx_hard_ack) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6ca40eea3022..afa5dcc05fe0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -593,6 +593,18 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +enum rxrpc_transmit_trace { + rxrpc_transmit_wait, + rxrpc_transmit_queue, + rxrpc_transmit_queue_reqack, + rxrpc_transmit_queue_last, + rxrpc_transmit_rotate, + rxrpc_transmit_end, + rxrpc_transmit__nr_trace +}; + +extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c1f83d22f9b7..c7eb5104e91a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) spin_unlock(&call->lock); + trace_rxrpc_transmit(call, rxrpc_transmit_rotate); wake_up(&call->waitq); while (list) { @@ -107,6 +108,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) } write_unlock(&call->state_lock); + trace_rxrpc_transmit(call, rxrpc_transmit_end); _leave(" = ok"); return true; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 598064d3bdd2..dca89995f03e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -132,3 +132,12 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { [rxrpc_client_to_waiting] = "->Wait", [rxrpc_client_uncount] = "Uncoun", }; + +const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { + [rxrpc_transmit_wait] = "WAI", + [rxrpc_transmit_queue] = "QUE", + [rxrpc_transmit_queue_reqack] = "QRA", + [rxrpc_transmit_queue_last] = "QLS", + [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_end] = "END", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8bfddf4e338c..28d8f73cf11d 100644 --- a/net/rxrpc/sendmsg.c +++ 
b/net/rxrpc/sendmsg.c @@ -56,6 +56,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, break; } + trace_rxrpc_transmit(call, rxrpc_transmit_wait); release_sock(&rx->sk); *timeo = schedule_timeout(*timeo); lock_sock(&rx->sk); @@ -104,8 +105,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) + if (last) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); + trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_queue); + } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.3 From ec71eb9ada34f8d1a58b7c35d906c59411295445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log received ACK packets Add a tracepoint to log information from received ACK packets. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 26 ++++++++++++++++++++++++++ net/rxrpc/input.c | 2 ++ 2 files changed, 28 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ffc74b3e5b76..2b19f3fa5174 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -234,6 +234,32 @@ TRACE_EVENT(rxrpc_transmit, __entry->tx_top - __entry->tx_hard_ack) ); +TRACE_EVENT(rxrpc_rx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, u8 reason, u8 n_acks), + + TP_ARGS(call, first, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c7eb5104e91a..7b18ca124978 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -440,6 +440,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, ntohs(buf.ack.maxSkew), -- cgit v1.2.3 From f3639df2d90bc919328c459b3c7c49ed5667a52f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log ACK transmission Add a tracepoint to log information about ACK transmission. 
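What the ACK-transmission tracepoint captures is the summary that rxrpc_fill_out_ack() derives from the receive window: the first still-unconsumed sequence number and the number of soft-ACK entries that follow it. A small sketch of that relationship; the function name and the window values are examples only.

#include <stdio.h>

/* Illustrative only: derive the ACK summary from the rx window bounds. */
static void fill_ack_summary(unsigned int hard_ack, unsigned int top)
{
    unsigned int first_packet = hard_ack + 1;   /* first seq still unconsumed */
    unsigned int n_acks = top - hard_ack;       /* soft-ACK entries to emit */

    printf("f=%08x n=%u\n", first_packet, n_acks);
}

int main(void)
{
    fill_ack_summary(5, 9);     /* window holds seqs 6..9, so 4 soft-ACKs */
    return 0;
}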
Signed-off-by: David Howels --- include/trace/events/rxrpc.h | 30 ++++++++++++++++++++++++++++++ net/rxrpc/conn_event.c | 3 +++ net/rxrpc/output.c | 7 ++++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2b19f3fa5174..d545d692ae22 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -260,6 +260,36 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_ack, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, + rxrpc_serial_t serial, u8 reason, u8 n_acks), + + TP_ARGS(call, first, serial, reason, n_acks), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, first ) + __field(rxrpc_serial_t, serial ) + __field(u8, reason ) + __field(u8, n_acks ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->first = first; + __entry->serial = serial; + __entry->reason = reason; + __entry->n_acks = n_acks; + ), + + TP_printk("c=%p %s f=%08x r=%08x n=%u", + __entry->call, + rxrpc_acks(__entry->reason), + __entry->first, + __entry->serial, + __entry->n_acks) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a43f4c94a88d..9b19c51831aa 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,6 +98,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); + + trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0b21ed859de7..2c9daeadce87 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -38,12 +38,14 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; /* Barrier against rxrpc_input_data(). */ + serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); @@ -51,7 +53,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.maxSkew = htons(call->ackr_skew); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); - pkt->ack.serial = htonl(call->ackr_serial); + pkt->ack.serial = htonl(serial); pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; @@ -75,6 +77,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); + trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, + top - hard_ack); + *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; -- cgit v1.2.3 From 58dc63c998ea3c5a27e2bf9251eddbf0977056a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow packets in the Rx buffer Add a tracepoint to follow the life of packets that get added to a call's receive buffer. 
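Received packets sit in a fixed, power-of-two sized ring indexed by sequence number, which is why the new trace prints the sequence alongside the hard-ack/top window bounds. A stand-alone sketch of the indexing; the ring size here is an arbitrary power of two, not the kernel's RXRPC_RXTX_BUFF_SIZE, and the window values are examples.

#include <stdio.h>

#define RING_SIZE 64U           /* must be a power of two */
#define RING_MASK (RING_SIZE - 1U)

int main(void)
{
    unsigned int hard_ack = 100, top = 103, seq;

    /* Illustrative only: each in-window seq maps to one ring slot. */
    for (seq = hard_ack + 1; seq <= top; seq++)
        printf("seq=%u -> ix=%u\n", seq, seq & RING_MASK);
    return 0;
}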
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 33 +++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/call_accept.c | 3 +++ net/rxrpc/input.c | 6 +++++- net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/recvmsg.c | 11 +++++++++++ 6 files changed, 73 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d545d692ae22..7dd5f0188681 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -290,6 +290,39 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_receive, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why, + rxrpc_serial_t serial, rxrpc_seq_t seq), + + TP_ARGS(call, why, serial, seq), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_receive_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->seq = seq; + __entry->hard_ack = call->rx_hard_ack; + __entry->top = call->rx_top; + ), + + TP_printk("c=%p %s r=%08x q=%08x w=%08x-%08x", + __entry->call, + rxrpc_receive_traces[__entry->why], + __entry->serial, + __entry->seq, + __entry->hard_ack, + __entry->top) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index afa5dcc05fe0..e5d2f2fb8e41 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -605,6 +605,18 @@ enum rxrpc_transmit_trace { extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; +enum rxrpc_receive_trace { + rxrpc_receive_incoming, + rxrpc_receive_queue, + rxrpc_receive_queue_last, + rxrpc_receive_front, + rxrpc_receive_rotate, + rxrpc_receive_end, + rxrpc_receive__nr_trace +}; + +extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3e474508ba75..a8d39d7cf42c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -367,6 +367,9 @@ found_service: goto out; } + trace_rxrpc_receive(call, rxrpc_receive_incoming, + sp->hdr.serial, sp->hdr.seq); + /* Make the call live. 
*/ rxrpc_incoming_call(rx, call, skb); conn = call->conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7b18ca124978..b690220533c6 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -284,8 +284,12 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index dca89995f03e..db5f1d54fc90 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -141,3 +141,12 @@ const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_rotate] = "ROT", [rxrpc_transmit_end] = "END", }; + +const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { + [rxrpc_receive_incoming] = "INC", + [rxrpc_receive_queue] = "QUE", + [rxrpc_receive_queue_last] = "QLS", + [rxrpc_receive_front] = "FRN", + [rxrpc_receive_rotate] = "ROT", + [rxrpc_receive_end] = "END", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8b8d7e14f800..22d51087c580 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { @@ -167,6 +168,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; u8 flags; int ix; @@ -183,6 +185,10 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_see_skb(skb); sp = rxrpc_skb(skb); flags = sp->hdr.flags; + serial = sp->hdr.serial; + if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) + serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -191,6 +197,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); _debug("%u,%u,%02x", hard_ack, top, flags); + trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -309,6 +316,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_see_skb(skb); sp = rxrpc_skb(skb); + if (!(flags & MSG_PEEK)) + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + if (msg) sock_recv_timestamp(msg, sock->sk, skb); -- cgit v1.2.3 From 849979051cbc9352857d8bb31895ae55afe19d96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 11:13:31 +0100 Subject: rxrpc: Add a tracepoint to follow what recvmsg does Add a tracepoint to follow what recvmsg does within AF_RXRPC. 
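One of the more useful events this adds is the hole case: the data-return path walks the receive window in sequence order and has to stop at the first slot that holds no packet yet. A rough sketch of that walk, with a plain array of string pointers standing in for the call's rxtx_buffer and invented packet names.

#include <stdio.h>
#include <stddef.h>

#define RING_SIZE 8U
#define RING_MASK (RING_SIZE - 1U)

int main(void)
{
    /* Illustrative only: slots for seqs 1..4, with seq 3 not yet received. */
    const char *ring[RING_SIZE] = { NULL, "pkt1", "pkt2", NULL, "pkt4" };
    unsigned int hard_ack = 0, top = 4, seq;

    for (seq = hard_ack + 1; seq <= top; seq++) {
        const char *pkt = ring[seq & RING_MASK];

        if (!pkt) {
            printf("q=%08x HOLE - stop\n", seq);    /* cf. rxrpc_recvmsg_hole */
            break;
        }
        printf("q=%08x NEXT %s\n", seq, pkt);       /* cf. rxrpc_recvmsg_next */
    }
    return 0;
}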
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 34 ++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 17 +++++++++++++++++ net/rxrpc/misc.c | 14 ++++++++++++++ net/rxrpc/recvmsg.c | 34 ++++++++++++++++++++++++++-------- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 7dd5f0188681..58732202e9f0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -323,6 +323,40 @@ TRACE_EVENT(rxrpc_receive, __entry->top) ); +TRACE_EVENT(rxrpc_recvmsg, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + rxrpc_seq_t seq, unsigned int offset, unsigned int len, + int ret), + + TP_ARGS(call, why, seq, offset, len, ret), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_recvmsg_trace, why ) + __field(rxrpc_seq_t, seq ) + __field(unsigned int, offset ) + __field(unsigned int, len ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->seq = seq; + __entry->offset = offset; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("c=%p %s q=%08x o=%u l=%u ret=%d", + __entry->call, + rxrpc_recvmsg_traces[__entry->why], + __entry->seq, + __entry->offset, + __entry->len, + __entry->ret) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e5d2f2fb8e41..a17341d2df3d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -617,6 +617,23 @@ enum rxrpc_receive_trace { extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; +enum rxrpc_recvmsg_trace { + rxrpc_recvmsg_enter, + rxrpc_recvmsg_wait, + rxrpc_recvmsg_dequeue, + rxrpc_recvmsg_hole, + rxrpc_recvmsg_next, + rxrpc_recvmsg_cont, + rxrpc_recvmsg_full, + rxrpc_recvmsg_data_return, + rxrpc_recvmsg_terminal, + rxrpc_recvmsg_to_be_accepted, + rxrpc_recvmsg_return, + rxrpc_recvmsg__nr_trace +}; + +extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index db5f1d54fc90..c7065d893d1e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -150,3 +150,17 @@ const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { [rxrpc_receive_rotate] = "ROT", [rxrpc_receive_end] = "END", }; + +const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { + [rxrpc_recvmsg_enter] = "ENTR", + [rxrpc_recvmsg_wait] = "WAIT", + [rxrpc_recvmsg_dequeue] = "DEQU", + [rxrpc_recvmsg_hole] = "HOLE", + [rxrpc_recvmsg_next] = "NEXT", + [rxrpc_recvmsg_cont] = "CONT", + [rxrpc_recvmsg_full] = "FULL", + [rxrpc_recvmsg_data_return] = "DATA", + [rxrpc_recvmsg_terminal] = "TERM", + [rxrpc_recvmsg_to_be_accepted] = "TBAC", + [rxrpc_recvmsg_return] = "RETN", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 22d51087c580..b62a08151895 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -94,6 +94,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -124,6 +126,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); return ret; } @@ -310,8 +313,11 @@ static int rxrpc_recvmsg_data(struct socket 
*sock, struct rxrpc_call *call, for (seq = hard_ack + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - if (!skb) + if (!skb) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); break; + } smp_rmb(); rxrpc_see_skb(skb); sp = rxrpc_skb(skb); @@ -327,10 +333,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } + } else { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -357,6 +368,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } if (rx_pkt_len > 0) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; @@ -383,6 +396,8 @@ out: call->rx_pkt_len = rx_pkt_len; } done: + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); _leave(" = %d [%u/%u]", ret, seq, top); return ret; } @@ -404,7 +419,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, DEFINE_WAIT(wait); - _enter(",,,%zu,%d", len, flags); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -424,8 +439,10 @@ try_again: if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; - if (timeo == 0) + if (timeo == 0) { + call = NULL; goto error_no_call; + } release_sock(&rx->sk); @@ -439,6 +456,8 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, + 0, 0, 0, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -457,7 +476,7 @@ try_again: rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); - _debug("recvmsg call %p", call); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -527,16 +546,15 @@ error: rxrpc_put_call(call, rxrpc_call_put); error_no_call: release_sock(&rx->sk); - _leave(" = %d", ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - release_sock(&rx->sk); - _leave(" = %d [wait]", ret); - return ret; + call = NULL; + goto error_no_call; } /** -- cgit v1.2.3 From ba39f3a0ed756ccd882adf4a77916ec863db3ce4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Remove printks from rxrpc_recvmsg_data() to fix uninit var Remove _enter/_debug/_leave calls from rxrpc_recvmsg_data() of which one uses an uninitialised variable. 
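The hazard being removed is the usual one with exit labels: a trace statement at the function's exit referenced loop state that at least one early-exit path never writes, so the value it logged could be garbage. A compact sketch of that shape, with the offending print already dropped as the patch does; all names here are invented.

#include <stdio.h>

static int consume(const int *ring, unsigned int top, int want)
{
    unsigned int seq;       /* only written if the loop runs */
    int ret = -1;

    if (!want)
        goto done;          /* early exit: seq is never initialised */

    for (seq = 1; seq <= top; seq++) {
        if (ring[seq] == want) {
            ret = 0;
            break;
        }
    }
done:
    /* A debug print such as printf("= %d [%u]", ret, seq) here would read
     * an uninitialised 'seq' on the early-exit path - hence its removal. */
    return ret;
}

int main(void)
{
    int ring[4] = { 0, 10, 20, 30 };

    printf("%d\n", consume(ring, 3, 20));
    printf("%d\n", consume(ring, 3, 0));
    return 0;
}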
Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index b62a08151895..79e65668bc58 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -296,8 +296,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; - _enter(""); - rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; @@ -343,8 +341,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); } - _debug("recvmsg %x DATA #%u { %d, %d }", - sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); /* We have to handle short, empty and used-up DATA packets. */ remain = len - *_offset; @@ -360,8 +356,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } /* handle piecemeal consumption of data packets */ - _debug("copied %d @%zu", copy, *_offset); - rx_pkt_offset += copy; rx_pkt_len -= copy; *_offset += copy; @@ -370,7 +364,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_len > 0) { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, rx_pkt_offset, rx_pkt_len, 0); - _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; break; @@ -398,7 +391,6 @@ out: done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); - _leave(" = %d [%u/%u]", ret, seq, top); return ret; } -- cgit v1.2.3 From 71f3ca408fd43b586c02480768a503af075b247e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Improve skb tracing Improve sk_buff tracing within AF_RXRPC by the following means: (1) Use an enum to note the event type rather than plain integers and use an array of event names rather than a big multi ?: list. (2) Distinguish Rx from Tx packets and account them separately. This requires the call phase to be tracked so that we know what we might find in rxtx_buffer[]. (3) Add a parameter to rxrpc_{new,see,get,free}_skb() to indicate the event type. (4) A pair of 'rotate' events are added to indicate packets that are about to be rotated out of the Rx and Tx windows. (5) A pair of 'lost' events are added, along with rxrpc_lose_skb() for packet loss injection recording. 
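Splitting the skb accounting into separate Rx and Tx counters is what lets the exit-time assertions say which direction leaked. A small sketch of that bookkeeping, using ordinary ints and printf in place of atomics and tracepoints; the op names are abbreviations invented for the example.

#include <assert.h>
#include <stdio.h>

static int n_tx_skbs, n_rx_skbs;    /* stand-ins for the two atomic counters */

static void trace_skb(const char *op, int tx, int count)
{
    printf("%s %s n=%d\n", tx ? "Tx" : "Rx", op, count);
}

static void skb_new_tx(void)  { trace_skb("NEW", 1, ++n_tx_skbs); }
static void skb_free_tx(void) { trace_skb("FRE", 1, --n_tx_skbs); }
static void skb_recv_rx(void) { trace_skb("RCV", 0, ++n_rx_skbs); }
static void skb_free_rx(void) { trace_skb("FRE", 0, --n_rx_skbs); }

int main(void)
{
    skb_new_tx();
    skb_recv_rx();
    skb_free_tx();
    skb_free_rx();

    /* Exit-time leak check, one assertion per direction. */
    assert(n_tx_skbs == 0);
    assert(n_rx_skbs == 0);
    return 0;
}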
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 12 ++++------ net/rxrpc/af_rxrpc.c | 5 +++-- net/rxrpc/ar-internal.h | 33 ++++++++++++++++++++++----- net/rxrpc/call_event.c | 8 +++---- net/rxrpc/call_object.c | 11 ++++++--- net/rxrpc/conn_event.c | 6 ++--- net/rxrpc/input.c | 13 ++++++----- net/rxrpc/local_event.c | 4 ++-- net/rxrpc/misc.c | 18 +++++++++++++++ net/rxrpc/output.c | 4 ++-- net/rxrpc/peer_event.c | 10 ++++----- net/rxrpc/recvmsg.c | 7 +++--- net/rxrpc/sendmsg.c | 10 ++++----- net/rxrpc/skbuff.c | 53 +++++++++++++++++++++++++++++++------------- 14 files changed, 131 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 58732202e9f0..75a5d8bf50e1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -107,14 +107,14 @@ TRACE_EVENT(rxrpc_call, ); TRACE_EVENT(rxrpc_skb, - TP_PROTO(struct sk_buff *skb, int op, int usage, int mod_count, - const void *where), + TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, + int usage, int mod_count, const void *where), TP_ARGS(skb, op, usage, mod_count, where), TP_STRUCT__entry( __field(struct sk_buff *, skb ) - __field(int, op ) + __field(enum rxrpc_skb_trace, op ) __field(int, usage ) __field(int, mod_count ) __field(const void *, where ) @@ -130,11 +130,7 @@ TRACE_EVENT(rxrpc_skb, TP_printk("s=%p %s u=%d m=%d p=%pSR", __entry->skb, - (__entry->op == 0 ? "NEW" : - __entry->op == 1 ? "SEE" : - __entry->op == 2 ? "GET" : - __entry->op == 3 ? "FRE" : - "PUR"), + rxrpc_skb_traces[__entry->op], __entry->usage, __entry->mod_count, __entry->where) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 09f81befc705..8dbf7bed2cc4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -45,7 +45,7 @@ u32 rxrpc_epoch; atomic_t rxrpc_debug_id; /* count of skbs currently in use */ -atomic_t rxrpc_n_skbs; +atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -867,7 +867,8 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a17341d2df3d..034f525f2235 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -520,6 +520,7 @@ struct rxrpc_call { rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ + bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ @@ -534,6 +535,27 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_skb_trace { + rxrpc_skb_rx_cleaned, + rxrpc_skb_rx_freed, + rxrpc_skb_rx_got, + rxrpc_skb_rx_lost, + rxrpc_skb_rx_received, + rxrpc_skb_rx_rotated, + rxrpc_skb_rx_purged, + rxrpc_skb_rx_seen, + rxrpc_skb_tx_cleaned, + rxrpc_skb_tx_freed, + rxrpc_skb_tx_got, + rxrpc_skb_tx_lost, + rxrpc_skb_tx_new, + rxrpc_skb_tx_rotated, + rxrpc_skb_tx_seen, + rxrpc_skb__nr_trace +}; + +extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; + enum rxrpc_conn_trace { rxrpc_conn_new_client, rxrpc_conn_new_service, @@ -642,7 +664,7 @@ 
extern const char *rxrpc_acks(u8 reason); /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_skbs; +extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -1000,10 +1022,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); -void rxrpc_new_skb(struct sk_buff *); -void rxrpc_see_skb(struct sk_buff *); -void rxrpc_get_skb(struct sk_buff *); -void rxrpc_free_skb(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f0cabc48a1b7..7d1b99824ed9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -170,7 +170,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); if (annotation == RXRPC_TX_ANNO_UNACK) { @@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); sp = rxrpc_skb(skb); @@ -211,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_send_data_packet(call->conn, skb) < 0) { call->resend_at = now + 2; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } @@ -219,7 +219,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_expose_client_call(call); sp->resend_at = now + rxrpc_resend_timeout; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); /* We need to clear the retransmit state, but there are two diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 54f30482a7fd..f50a6094e198 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -182,6 +182,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; + call->tx_phase = true; _leave(" = %p", call); return call; @@ -458,7 +459,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); call->rxtx_buffer[i] = NULL; } @@ -552,9 +555,11 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) /* Clean up the Rx/Tx buffer */ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? 
rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9b19c51831aa..75a15a4c74c3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -388,7 +388,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -399,7 +399,7 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); break; } } @@ -416,7 +416,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [EPROTO]"); goto out; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b690220533c6..84bb16d47b85 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -50,7 +50,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; @@ -66,7 +66,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) skb = list; list = skb->next; skb->next = NULL; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } } @@ -99,6 +99,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) default: break; case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->tx_phase = false; call->state = RXRPC_CALL_CLIENT_RECV_REPLY; break; case RXRPC_CALL_SERVER_AWAIT_ACK: @@ -278,7 +279,7 @@ next_subpacket: * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() * and also rxrpc_fill_out_ack(). 
*/ - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_rx_got); call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -691,13 +692,13 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -821,7 +822,7 @@ void rxrpc_data_ready(struct sock *udp_sk) discard_unlock: rcu_read_unlock(); discard: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: trace_rxrpc_rx_done(0, 0); return; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index f073e932500e..190f68bd9e27 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -107,7 +107,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index c7065d893d1e..026e1f2e83ff 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -102,6 +102,24 @@ const char *rxrpc_acks(u8 reason) return str[reason]; } +const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { + [rxrpc_skb_rx_cleaned] = "Rx CLN", + [rxrpc_skb_rx_freed] = "Rx FRE", + [rxrpc_skb_rx_got] = "Rx GOT", + [rxrpc_skb_rx_lost] = "Rx *L*", + [rxrpc_skb_rx_received] = "Rx RCV", + [rxrpc_skb_rx_purged] = "Rx PUR", + [rxrpc_skb_rx_rotated] = "Rx ROT", + [rxrpc_skb_rx_seen] = "Rx SEE", + [rxrpc_skb_tx_cleaned] = "Tx CLN", + [rxrpc_skb_tx_freed] = "Tx FRE", + [rxrpc_skb_tx_got] = "Tx GOT", + [rxrpc_skb_tx_lost] = "Tx *L*", + [rxrpc_skb_tx_new] = "Tx NEW", + [rxrpc_skb_tx_rotated] = "Tx ROT", + [rxrpc_skb_tx_seen] = "Tx SEE", +}; + const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { [rxrpc_conn_new_client] = "NWc", [rxrpc_conn_new_service] = "NWs", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2c9daeadce87..a2cad5ce7416 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -324,7 +324,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { @@ -343,7 +343,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) kernel_sendmsg(local->socket, &msg, iov, 2, size); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 9e0725f5652b..18276e7cb9e0 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -155,11 +155,11 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); return; } @@ -169,7 +169,7 @@ void rxrpc_error_report(struct sock *sk) peer = 
NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [no peer]"); return; } @@ -179,7 +179,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -187,7 +187,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ rxrpc_queue_work(&peer->error_distributor); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 79e65668bc58..6ba4af5a8d95 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -155,6 +155,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) break; case RXRPC_CALL_SERVER_RECV_REQUEST: + call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; break; default: @@ -185,7 +186,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); sp = rxrpc_skb(skb); flags = sp->hdr.flags; serial = sp->hdr.serial; @@ -197,7 +198,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); @@ -317,7 +318,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } smp_rmb(); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (!(flags & MSG_PEEK)) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 28d8f73cf11d..6a39ee97a0b7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,7 +100,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -146,7 +146,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_instant_resend(call, ix); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(""); } @@ -201,7 +201,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); copied = 0; do { @@ -242,7 +242,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!skb) goto maybe_error; - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_tx_new); _debug("ALLOC SEND %p", skb); @@ -352,7 +352,7 @@ out: return ret; call_terminated: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(" = %d", -call->error); return -call->error; diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 620d9ccaf3c1..5154cbf7e540 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,54 +18,76 @@ #include #include "ar-internal.h" +#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) + /* - * Note the existence of a new-to-us socket buffer (allocated or dequeued). 
+ * Note the allocation or reception of a socket buffer. */ -void rxrpc_new_skb(struct sk_buff *skb) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ -void rxrpc_see_skb(struct sk_buff *skb) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { - int n = atomic_read(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + int n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } } /* * Note the addition of a ref on a socket buffer. */ -void rxrpc_get_skb(struct sk_buff *skb) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); skb_get(skb); } /* * Note the destruction of a socket buffer. */ -void rxrpc_free_skb(struct sk_buff *skb) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); kfree_skb(skb); } } +/* + * Note the injected loss of a socket buffer. + */ +void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + if (op == rxrpc_skb_tx_lost) { + n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + } else { + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } + } +} + /* * Clear a queue of socket buffers. */ @@ -74,8 +96,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list) const void *here = __builtin_return_address(0); struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { - int n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); + trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, + atomic_read(&skb->users), n, here); kfree_skb(skb); } } -- cgit v1.2.3 From 8a681c360559f75a80b37e6a6a9590457361ccb0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:15 +0100 Subject: rxrpc: Add config to inject packet loss Add a configuration option to inject packet loss by discarding approximately every 8th packet received and approximately every 8th DATA packet transmitted. Note that no locking is used, but it shouldn't really matter. 
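The heuristic itself is just a free-running counter masked with 7, so one packet in every eight (about 12.5%) is discarded. A rough userspace sketch of that arithmetic, assuming nothing beyond the C standard library, follows:

/*
 * Userspace sketch of the drop heuristic only; the kernel patch applies the
 * same test in rxrpc_data_ready() and rxrpc_send_data_packet().
 */
#include <stdbool.h>
#include <stdio.h>

static bool inject_loss(void)
{
	static int lose;

	/* True exactly once per eight calls: ~12.5% simulated loss. */
	return (lose++ & 7) == 7;
}

int main(void)
{
	int dropped = 0;

	for (int i = 0; i < 64; i++)
		if (inject_loss())
			dropped++;

	printf("dropped %d of 64 packets\n", dropped);	/* "dropped 8 of 64 packets" */
	return 0;
}

Because the counter is a plain static int with no locking, concurrent callers may occasionally skip or double-count a drop, which matches the note above that this shouldn't really matter.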
Signed-off-by: David Howells --- net/rxrpc/Kconfig | 7 +++++++ net/rxrpc/input.c | 8 ++++++++ net/rxrpc/output.c | 9 +++++++++ 3 files changed, 24 insertions(+) (limited to 'net') diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 13396c74b5c1..86f8853a038c 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -26,6 +26,13 @@ config AF_RXRPC_IPV6 Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as its network transport. +config AF_RXRPC_INJECT_LOSS + bool "Inject packet loss into RxRPC packet stream" + depends on AF_RXRPC + help + Say Y here to inject packet loss by discarding some received and some + transmitted packets. + config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 84bb16d47b85..7ac1edf3aac7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -712,6 +712,14 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); + return; + } + } + _net("Rx UDP packet from %08x:%04hu", ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a2cad5ce7416..16e18a94ffa6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -225,6 +225,15 @@ int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); + _leave(" = 0 [lose]"); + return 0; + } + } + /* send the packet with the don't fragment bit set if we currently * think it's small enough */ if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { -- cgit v1.2.3 From ffb4d6c8508657824bcef68a36b2a0f9d8c09d10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:12:33 -0700 Subject: tcp: fix overflow in __tcp_retransmit_skb() If a TCP socket gets a large write queue, an overflow can happen in a test in __tcp_retransmit_skb() preventing all retransmits. The flow then stalls and resets after timeouts. Tested: sysctl -w net.core.wmem_max=1000000000 netperf -H dest -- -s 1000000000 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdaef7fd6e47..f53d0cca5fa4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2605,7 +2605,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > - min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), + sk->sk_sndbuf)) return -EAGAIN; if (skb_still_in_host_queue(sk, skb)) -- cgit v1.2.3 From 8ab86c00e349cef9fb14719093a7f198bcc72629 Mon Sep 17 00:00:00 2001 From: "phil.turnbull@oracle.com" Date: Thu, 15 Sep 2016 12:41:44 -0400 Subject: irda: Free skb on irda_accept error path. skb is not freed if newsk is NULL. Rework the error path so free_skb is unconditionally called on function exit. Fixes: c3ea9fa27413 ("[IrDA] af_irda: IRDA_ASSERT cleanups") Signed-off-by: Phil Turnbull Signed-off-by: David S. 
Miller --- net/irda/af_irda.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..ccc244406fb9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } -- cgit v1.2.3 From 3613b3dbd1ade9a6a626dae1f608c57638eb5e8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 09:33:02 -0700 Subject: tcp: prepare skbs for better sack shifting With large BDP TCP flows and lossy networks, it is very important to keep a low number of skbs in the write queue. RACK and SACK processing can perform a linear scan of it. We should avoid putting any payload in skb->head, so that SACK shifting can be done if needed. With this patch, we allow to pack ~0.5 MB per skb instead of the 64KB initially cooked at tcp_sendmsg() time. This gives a reduction of number of skbs in write queue by eight. tcp_rack_detect_loss() likes this. We still allow payload in skb->head for first skb put in the queue, to not impact RPC workloads. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a13fcb369f52..7dae800092e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1020,17 +1020,31 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(tcp_sendpage); -static inline int select_size(const struct sock *sk, bool sg) +/* Do not bother using a page frag for very small frames. + * But use this heuristic only for the first skb in write queue. + * + * Having no payload in skb->head allows better SACK shifting + * in tcp_shift_skb_data(), reducing sack/rack overhead, because + * write queue has less skbs. + * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB. + * This also speeds up tso_fragment(), since it wont fallback + * to tcp_fragment(). + */ +static int linear_payload_sz(bool first_skb) +{ + if (first_skb) + return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + return 0; +} + +static int select_size(const struct sock *sk, bool sg, bool first_skb) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { if (sk_can_gso(sk)) { - /* Small frames wont use a full page: - * Payload will immediately follow tcp header. 
- */ - tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + tmp = linear_payload_sz(first_skb); } else { int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); @@ -1161,6 +1175,8 @@ restart: } if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { + bool first_skb; + new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. @@ -1172,10 +1188,11 @@ new_segment: process_backlog = false; goto restart; } + first_skb = skb_queue_empty(&sk->sk_write_queue); skb = sk_stream_alloc_skb(sk, - select_size(sk, sg), + select_size(sk, sg, first_skb), sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + first_skb); if (!skb) goto wait_for_memory; -- cgit v1.2.3 From 5ff904d55da821fd194ff493f2928d134ce5b67a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 15 Sep 2016 18:51:25 +0100 Subject: llc: switch type to bool as the timeout is only tested versus 0 (As asked by Dave in Februrary) Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/llc/af_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8ae3ed97d95c..db916cf51ffe 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static long llc_ui_wait_for_conn(struct sock *sk, long timeout); +static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static long llc_ui_wait_for_conn(struct sock *sk, long timeout) +static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); -- cgit v1.2.3 From eb94737d711913a23e466b99c0d9ffdf15651290 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 15 Sep 2016 22:23:26 +0200 Subject: l2tp: constify net_device_ops structures Check for net_device_ops structures that are only stored in the netdev_ops field of a net_device structure. This field is declared const, so net_device_ops structures that have this property can be declared as const also. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @r disable optional_qualifier@ identifier i; position p; @@ static struct net_device_ops i@p = { ... }; @ok@ identifier r.i; struct net_device e; position p; @@ e.netdev_ops = &i@p; @bad@ position p != {r.p,ok.p}; identifier r.i; struct net_device_ops e; @@ e@i@p @depends on !bad disable optional_qualifier@ identifier r.i; @@ static +const struct net_device_ops i = { ... }; // The result of size on this file before the change is: text data bss dec hex filename 3401 931 44 4376 1118 net/l2tp/l2tp_eth.o and after the change it is: text data bss dec hex filename 3993 347 44 4384 1120 net/l2tp/l2tp_eth.o Signed-off-by: Julia Lawall Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index ef2cd30ca06e..965f7e344cef 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -121,7 +121,7 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, } -static struct net_device_ops l2tp_eth_netdev_ops = { +static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, -- cgit v1.2.3 From cfc7381b3002756b1dcada32979e942aa3126e31 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:29 -0700 Subject: ip_tunnel: add collect_md mode to IPIP tunnel Similar to gre, vxlan, geneve tunnels allow IPIP tunnels to operate in 'collect metadata' mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. ovs can use it as well in the future (once appropriate ovs-vport abstractions and user apis are added). Note that just like in other tunnels we cannot cache the dst, since tunnel_info metadata can be different for every packet. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 76 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ipip.c | 35 +++++++++++++++---- 4 files changed, 108 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e598c639aa6f..59557c07904b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + const u8 proto); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9865c8caedde..18d5dc13985d 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -73,6 +73,7 @@ enum { IFLA_IPTUN_ENCAP_FLAGS, IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, + IFLA_IPTUN_COLLECT_METADATA, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 95649ebd2874..5719d6ba0824 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + u32 headroom = sizeof(struct iphdr); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + const struct iphdr *inner_iph; + struct rtable *rt; + struct flowi4 fl4; + __be16 df = 0; + u8 tos, ttl; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto tx_error; + key = &tun_info->key; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + 
inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + tos = key->tos; + if (tos == 1) { + if (skb->protocol == htons(ETH_P_IP)) + tos = inner_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + } + init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link); + if (tunnel->encap.type != TUNNEL_ENCAP_NONE) + goto tx_error; + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (rt->dst.dev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); + ttl = key->ttl; + if (ttl == 0) { + if (skb->protocol == htons(ETH_P_IP)) + ttl = inner_iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; + else + ttl = ip4_dst_hoplimit(&rt->dst); + } + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + else if (skb->protocol == htons(ETH_P_IP)) + df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + if (headroom > dev->needed_headroom) + dev->needed_headroom = headroom; + + if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); + goto tx_dropped; + } + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, + key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + return; +tx_error: + dev->stats.tx_errors++; + goto kfree; +tx_dropped: + dev->stats.tx_dropped++; +kfree: + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); + void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4ae3f8e6c6cc..c9392589c415 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -115,6 +115,7 @@ #include #include #include +#include static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); @@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->collect_md) { + tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; @@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, ipproto); + if (tunnel->collect_md) + ip_md_tunnel_xmit(skb, dev, ipproto); + else + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: @@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, bool *collect_md) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; + *collect_md = false; if (!data) return; @@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off 
= htons(IP_DF); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + *collect_md = true; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &t->collect_md); return ip_tunnel_newlink(dev, tb, &p); } @@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + bool collect_md; if (ipip_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); @@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipip_netlink_parms(data, &p); + ipip_netlink_parms(data, &p, &collect_md); + if (collect_md) + return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) tunnel->encap.flags)) goto nla_put_failure; + if (tunnel->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { -- cgit v1.2.3 From 8d79266bc48c6ab6477d04e159cabf1e7809cb72 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Sep 2016 13:00:30 -0700 Subject: ip6_tunnel: add collect_md mode to IPv6 tunnels Similar to gre, vxlan, geneve tunnels allow IPIP6 and IP6IP6 tunnels to operate in 'collect metadata' mode. Unlike ipv4 code here it's possible to reuse ip6_tnl_xmit() function for both collect_md and traditional tunnels. bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/ip6_tunnel.h | 1 + net/ipv6/ip6_tunnel.c | 178 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 134 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 43a5a0e4524c..20ed9699fcd4 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -23,6 +23,7 @@ struct __ip6_tnl_parm { __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ + bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5c5779720ef1..6a66adba0c22 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ #include #include #include +#include MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -90,6 +91,7 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; + struct ip6_tnl __rcu *collect_md_tun; }; static struct net_device_stats *ip6_get_stats(struct net_device *dev) @@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ return t; } + t = rcu_dereference(ip6n->collect_md_tun); + if (t) + return t; + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; @@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, NULL); + for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); @@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + if (t->parms.collect_md) { + tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } @@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, int mtu; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; + u8 hop_limit; int err = -1; + if (t->parms.collect_md) { + hop_limit = skb_tunnel_info(skb)->key.ttl; + goto route_lookup; + } else { + hop_limit = t->parms.hop_limit; + } + /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { struct in6_addr *addr6; @@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, 
struct net_device *dev, __u8 dsfield, goto tx_err_link_failure; if (!dst) { +route_lookup: dst = ip6_route_output(net, NULL, fl6); if (dst->error) @@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, dst = NULL; goto tx_err_link_failure; } + if (t->parms.collect_md && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6->daddr, 0, &fl6->saddr)) + goto tx_err_link_failure; ndst = dst; } @@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; @@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, skb = new_skb; } - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + if (t->parms.collect_md) { + if (t->encap.type != TUNNEL_ENCAP_NONE) + goto tx_err_dst_release; + } else { + if (!fl6->flowi6_mark && ndst) + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + } skb_dst_set(skb, dst); if (encap_limit >= 0) { @@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; @@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; + dsfield = ipv4_get_dsfield(iph); - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; - dsfield = ipv4_get_dsfield(iph); + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + dsfield = ipv6_get_dsfield(ipv6h); + + if 
(t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); + if (t->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } @@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + parms->collect_md = true; } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *nt, *t; struct ip_tunnel_encap ipencap; @@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, ip6_tnl_netlink_parms(data, &nt->parms); - t = ip6_tnl_locate(net, &nt->parms, 0); - if (!IS_ERR(t)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ip6n->collect_md_tun)) + return -EEXIST; + } else { + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) + return -EEXIST; + } return ip6_tnl_create2(dev); } @@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], return err; } ip6_tnl_netlink_parms(data, &p); + if (p.collect_md) + return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { @@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -1955,16 +2043,15 @@ static int 
ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, - tunnel->encap.type) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, - tunnel->encap.sport) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, - tunnel->encap.dport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.flags)) + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; + if (parm->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { -- cgit v1.2.3 From 2c89791eeb6f3873349c240345c1879ef6a16f63 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:18 +0800 Subject: sctp: remove the unnecessary state check in sctp_outq_tail Data Chunks are only sent by sctp_primitive_SEND, in which sctp checks the asoc's state through statetable before calling sctp_outq_tail. So there's no need to check the asoc's state again in sctp_outq_tail. Besides, sctp_do_sm is protected by lock_sock, even if sending msg is interrupted by timer events, the event's processes still need to acquire lock_sock first. It means no others CMDs can be enqueue into side effect list before CMD_SEND_MSG to change asoc->state, so it's safe to remove it. This patch is to remove redundant asoc->state check from sctp_outq_tail. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 53 ++++++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 72e54a416af6..da2418b64c86 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -299,50 +299,25 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) * immediately. */ if (sctp_chunk_is_data(chunk)) { - /* Is it OK to queue data chunks? */ - /* From 9. Termination of Association - * - * When either endpoint performs a shutdown, the - * association on each peer will stop accepting new - * data from its user and only deliver data in queue - * at the time of sending or receiving the SHUTDOWN - * chunk. - */ - switch (q->asoc->state) { - case SCTP_STATE_CLOSED: - case SCTP_STATE_SHUTDOWN_PENDING: - case SCTP_STATE_SHUTDOWN_SENT: - case SCTP_STATE_SHUTDOWN_RECEIVED: - case SCTP_STATE_SHUTDOWN_ACK_SENT: - /* Cannot send after transport endpoint shutdown */ - error = -ESHUTDOWN; - break; - - default: - pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", - __func__, q, chunk, chunk && chunk->chunk_hdr ? 
- sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : - "illegal chunk"); - - sctp_chunk_hold(chunk); - sctp_outq_tail_data(q, chunk); - if (chunk->asoc->prsctp_enable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - chunk->asoc->sent_cnt_removable++; - if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) - SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); - else - SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); - break; - } + pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", + __func__, q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : + "illegal chunk"); + + sctp_chunk_hold(chunk); + sctp_outq_tail_data(q, chunk); + if (chunk->asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + chunk->asoc->sent_cnt_removable++; + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); + else + SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } - if (error < 0) - return error; - if (!q->cork) error = sctp_outq_flush(q, 0, gfp); -- cgit v1.2.3 From 66388f2c08dfa38071f9eceae7bb29060d9be9aa Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:19 +0800 Subject: sctp: do not return the transmit err back to sctp_sendmsg Once a chunk is enqueued successfully, the sctp queues can take care of it. Even if it fails to transmit (for example because of nomem), it should be put into the retransmit queue. If sctp reports this error to users, it confuses them: they may resend that msg, even though the kernel sctp stack is already in charge of retransmitting it. Besides, this error is probably not from the failure to transmit the current msg, but from transmitting or retransmitting another msg's chunks, as sctp_outq_flush just tries to send out all transports' chunks. This patch makes sctp_cmd_send_msg return void, and no longer returns the transmit err back to sctp_sendmsg. Fixes: 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 12d45193357c..cf6e4f0de729 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1020,19 +1020,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, * This way the whole message is queued up and bundling if * encouraged for small fragments.
*/ -static int sctp_cmd_send_msg(struct sctp_association *asoc, - struct sctp_datamsg *msg, gfp_t gfp) +static void sctp_cmd_send_msg(struct sctp_association *asoc, + struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; - int error = 0; - - list_for_each_entry(chunk, &msg->chunks, frag_list) { - error = sctp_outq_tail(&asoc->outqueue, chunk, gfp); - if (error) - break; - } - return error; + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_outq_tail(&asoc->outqueue, chunk, gfp); } @@ -1709,7 +1703,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_outq_cork(&asoc->outqueue); local_cork = 1; } - error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); + sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_SEND_NEXT_ASCONF: sctp_cmd_send_asconf(asoc); -- cgit v1.2.3 From b61c654f9b3f1a271217e46c893f80565b1f754d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:20 +0800 Subject: sctp: free msg->chunks when sctp_primitive_SEND return err Last patch "sctp: do not return the transmit err back to sctp_sendmsg" made sctp_primitive_SEND return err only when asoc state is unavailable. In this case, chunks are not enqueued, they have no chance to be freed if we don't take care of them later. This Patch is actually to revert commit 1cd4d5c4326a ("sctp: remove the unused sctp_datamsg_free()"), commit 69b5777f2e57 ("sctp: hold the chunks only after the chunk is enqueued in outq") and commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg"), to use sctp_datamsg_free to free the chunks of current msg. Fixes: 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/chunk.c | 13 +++++++++++++ net/sctp/outqueue.c | 1 - net/sctp/socket.c | 8 ++++++-- 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ce93c4b10d26..f61fb7c87e53 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -537,6 +537,7 @@ struct sctp_datamsg { struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); +void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a55e54738b81..af9cc8055465 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + + /* This doesn't have to be a _safe vairant because + * sctp_chunk_free() only drops the refs. + */ + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_chunk_free(chunk); + + sctp_datamsg_put(msg); +} + /* Final destructruction of datamsg memory. 
*/ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index da2418b64c86..6c109b0f8495 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -304,7 +304,6 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); - sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); if (chunk->asoc->prsctp_enable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fc417a8b476..6cdc61c21438 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); - sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? */ - if (err) + if (err) { + sctp_datamsg_free(datamsg); goto out_free; + } pr_debug("%s: we sent primitively\n", __func__); + sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { -- cgit v1.2.3 From 645194409b0634a43890ec27c491c368b3bffc07 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:21 +0800 Subject: sctp: save transmit error to sk_err in sctp_outq_flush Every time when sctp calls sctp_outq_flush, it sends out the chunks of control queue, retransmit queue and data queue. Even if some trunks are failed to transmit, it still has to flush all the transports, as it's the only chance to clean that transmit_list. So the latest transmit error here should be returned back. This transmit error is an internal error of sctp stack. I checked all the places where it uses the transmit error (the return value of sctp_outq_flush), most of them are actually just save it to sk_err. Except for sctp_assoc/endpoint_bh_rcv, they will drop the chunk if it's failed to send a REPLY, which is actually incorrect, as we can't be sure the error that sctp_outq_flush returns is from sending that REPLY. So it's meaningless for sctp_outq_flush to return error back. This patch is to save transmit error to sk_err in sctp_outq_flush, the new error can update the old value. Eventually, sctp_wait_for_* would check for it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/output.c | 3 ++- net/sctp/outqueue.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 31b7bc35895d..f2597a9eff74 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -180,7 +180,6 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, int one_packet, gfp_t gfp) { sctp_xmit_t retval; - int error = 0; pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? 
chunk->skb->len : -1); @@ -188,6 +187,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { + int error = 0; + error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 6c109b0f8495..052a4796a457 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -533,7 +533,6 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, sctp_retransmit_reason_t reason) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; switch (reason) { case SCTP_RTXR_T3_RTX: @@ -577,10 +576,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) - error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); - - if (error) - q->asoc->base.sk->sk_err = -error; + sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* @@ -893,8 +889,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) sctp_packet_config(&singleton, vtag, 0); sctp_packet_append_chunk(&singleton, chunk); error = sctp_packet_transmit(&singleton, gfp); - if (error < 0) - return error; + if (error < 0) { + asoc->base.sk->sk_err = -error; + return 0; + } break; case SCTP_CID_ABORT: @@ -992,6 +990,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) retran: error = sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer); + if (error < 0) + asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(transport); @@ -1166,14 +1166,17 @@ sctp_flush_out: struct sctp_transport, send_ready); packet = &t->packet; - if (!sctp_packet_empty(packet)) + if (!sctp_packet_empty(packet)) { error = sctp_packet_transmit(packet, gfp); + if (error < 0) + asoc->base.sk->sk_err = -error; + } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } - return error; + return 0; } /* Update unack_data based on the incoming SACK chunk */ -- cgit v1.2.3 From 83dbc3d4a38411ef38f680d7045c8478cc9c5a56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:22 +0800 Subject: sctp: make sctp_outq_flush/tail/uncork return void sctp_outq_flush's return value is meaningless now; this patch makes sctp_outq_flush return void, as well as sctp_outq_tail and sctp_outq_uncork. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/outqueue.c | 19 +++++++------------ net/sctp/sm_sideeffect.c | 9 ++++----- 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f61fb7c87e53..8693dc452a7f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1077,7 +1077,7 @@ struct sctp_outq { void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); +void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); @@ -1085,7 +1085,7 @@ void sctp_outq_restart(struct sctp_outq *); void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); -int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); +void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); /* Uncork and flush an outqueue. */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 052a4796a457..8c3f446d965c 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) +void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -318,9 +317,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) } if (!q->cork) - error = sctp_outq_flush(q, 0, gfp); - - return error; + sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list @@ -748,12 +745,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) +void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0, gfp); + sctp_outq_flush(q, 0, gfp); } @@ -766,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) * locking concerns must be made. Today we use the sock lock to protect * this function. 
*/ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -891,7 +888,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) error = sctp_packet_transmit(&singleton, gfp); if (error < 0) { asoc->base.sk->sk_err = -error; - return 0; + return; } break; @@ -1175,8 +1172,6 @@ sctp_flush_out: /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } - - return 0; } /* Update unack_data based on the incoming SACK chunk */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index cf6e4f0de729..c345bf153bed 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1421,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, - gfp); + sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: @@ -1676,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1733,9 +1732,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; -- cgit v1.2.3 From 41001faf95faaff7c4f4f93c6bb544ee227ad0cc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Sep 2016 02:04:23 +0800 Subject: sctp: not return ENOMEM err back in sctp_packet_transmit As David and Marcelo's suggestion, ENOMEM err shouldn't return back to user in transmit path. Instead, sctp's retransmit would take care of the chunks that fail to send because of ENOMEM. This patch is only to do some release job when alloc_skb fails, not to return ENOMEM back any more. Besides, it also cleans up sctp_packet_transmit's err path, and fixes some issues in err path: - It didn't free the head skb in nomem: path. - No need to check nskb in no_route: path. - It should goto err: path if alloc_skb fails for head. - Not all the NOMEMs should free nskb. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/output.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index f2597a9eff74..0c605ec74dc4 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -442,14 +442,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * time. Application may notice this error. 
*/ pr_err_once("Trying to GSO but underlying device doesn't support it."); - goto nomem; + goto err; } } else { pkt_size = packet->size; } head = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!head) - goto nomem; + goto err; if (gso) { NAPI_GRO_CB(head)->last = head; skb_shinfo(head)->gso_type = sk->sk_gso_type; @@ -470,8 +470,12 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } dst = dst_clone(tp->dst); - if (!dst) - goto no_route; + if (!dst) { + if (asoc) + IP_INC_STATS(sock_net(asoc->base.sk), + IPSTATS_MIB_OUTNOROUTES); + goto nodst; + } skb_dst_set(head, dst); /* Build the SCTP header. */ @@ -622,8 +626,10 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (!gso) break; - if (skb_gro_receive(&head, nskb)) + if (skb_gro_receive(&head, nskb)) { + kfree_skb(nskb); goto nomem; + } nskb = NULL; if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= sk->sk_gso_max_segs)) @@ -717,18 +723,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } head->ignore_df = packet->ipfragok; tp->af_specific->sctp_xmit(head, tp); + goto out; -out: - sctp_packet_reset(packet); - return err; -no_route: - kfree_skb(head); - if (nskb != head) - kfree_skb(nskb); - - if (asoc) - IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); +nomem: + if (packet->auth && list_empty(&packet->auth->list)) + sctp_chunk_free(packet->auth); +nodst: /* FIXME: Returning the 'err' will effect all the associations * associated with a socket, although only one of the paths of the * association is unreachable. @@ -737,22 +738,18 @@ no_route: * required. */ /* err = -EHOSTUNREACH; */ -err: - /* Control chunks are unreliable so just drop them. DATA chunks - * will get resent or dropped later. - */ + kfree_skb(head); +err: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } - goto out; -nomem: - if (packet->auth && list_empty(&packet->auth->list)) - sctp_chunk_free(packet->auth); - err = -ENOMEM; - goto err; + +out: + sctp_packet_reset(packet); + return err; } /******************************************************************** -- cgit v1.2.3 From 40773966ccf1985a1b2bb570a03cbeaf1cbd4e00 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 15 Sep 2016 19:11:52 -0300 Subject: openvswitch: fix flow stats accounting when node 0 is not possible On a system with only node 1 as possible, all statistics is going to be accounted on node 0 as it will have a single writer. However, when getting and clearing the statistics, node 0 is not going to be considered, as it's not a possible node. Tested that statistics are not zero on a system with only node 1 possible. Also compile-tested with CONFIG_NUMA off. Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/openvswitch/flow.c | 6 ++++-- net/openvswitch/flow_table.c | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 1240ae3b88d2..5b80612df182 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -142,7 +142,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow, *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - for_each_node(node) { + /* We open code this to make sure node 0 is always considered */ + for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) { struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); if (stats) { @@ -165,7 +166,8 @@ void ovs_flow_stats_clear(struct sw_flow *flow) { int node; - for_each_node(node) { + /* We open code this to make sure node 0 is always considered */ + for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) { struct flow_stats *stats = ovsl_dereference(flow->stats[node]); if (stats) { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d073fff82fdb..957a3c31dbb0 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -148,8 +148,9 @@ static void flow_free(struct sw_flow *flow) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); - for_each_node(node) - if (flow->stats[node]) + /* We open code this to make sure node 0 is always considered */ + for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) + if (node != 0 && flow->stats[node]) kmem_cache_free(flow_stats_cache, (struct flow_stats __force *)flow->stats[node]); kmem_cache_free(flow_cache, flow); -- cgit v1.2.3 From db74a3335e0f645e3139c80bcfc90feb01d8e304 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 15 Sep 2016 19:11:53 -0300 Subject: openvswitch: use percpu flow stats Instead of using flow stats per NUMA node, use it per CPU. When using megaflows, the stats lock can be a bottleneck in scalability. On a E5-2690 12-core system, usual throughput went from ~4Mpps to ~15Mpps when forwarding between two 40GbE ports with a single flow configured on the datapath. This has been tested on a system with possible CPUs 0-7,16-23. After module removal, there were no corruption on the slab cache. Signed-off-by: Thadeu Lima de Souza Cascardo Cc: pravin shelar Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 42 ++++++++++++++++++++++-------------------- net/openvswitch/flow.h | 4 ++-- net/openvswitch/flow_table.c | 26 +++++++++----------------- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 5b80612df182..0fa45439def1 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); + int cpu = smp_processor_id(); int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - stats = rcu_dereference(flow->stats[node]); + stats = rcu_dereference(flow->stats[cpu]); - /* Check if already have node-specific stats. */ + /* Check if already have CPU-specific stats. */ if (likely(stats)) { spin_lock(&stats->lock); /* Mark if we write on the pre-allocated stats. 
*/ - if (node == 0 && unlikely(flow->stats_last_writer != node)) - flow->stats_last_writer = node; + if (cpu == 0 && unlikely(flow->stats_last_writer != cpu)) + flow->stats_last_writer = cpu; } else { stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ spin_lock(&stats->lock); - /* If the current NUMA-node is the only writer on the + /* If the current CPU is the only writer on the * pre-allocated stats keep using them. */ - if (unlikely(flow->stats_last_writer != node)) { + if (unlikely(flow->stats_last_writer != cpu)) { /* A previous locker may have already allocated the - * stats, so we need to check again. If node-specific + * stats, so we need to check again. If CPU-specific * stats were already allocated, we update the pre- * allocated stats as we have already locked them. */ - if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_access_pointer(flow->stats[node]))) { - /* Try to allocate node-specific stats. */ + if (likely(flow->stats_last_writer != -1) && + likely(!rcu_access_pointer(flow->stats[cpu]))) { + /* Try to allocate CPU-specific stats. */ struct flow_stats *new_stats; new_stats = @@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats->tcp_flags = tcp_flags; spin_lock_init(&new_stats->lock); - rcu_assign_pointer(flow->stats[node], + rcu_assign_pointer(flow->stats[cpu], new_stats); goto unlock; } } - flow->stats_last_writer = node; + flow->stats_last_writer = cpu; } } @@ -136,15 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int node; + int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - /* We open code this to make sure node 0 is always considered */ - for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) { - struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { /* Local CPU may write on non-local stats, so we must @@ -164,11 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex. */ void ovs_flow_stats_clear(struct sw_flow *flow) { - int node; + int cpu; - /* We open code this to make sure node 0 is always considered */ - for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) { - struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { spin_lock_bh(&stats->lock); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 156a3029c17b..ae783f5c6695 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -178,14 +178,14 @@ struct sw_flow { struct hlist_node node[2]; u32 hash; } flow_table, ufid_table; - int stats_last_writer; /* NUMA-node id of the last writer on + int stats_last_writer; /* CPU id of the last writer on * 'stats[0]'. */ struct sw_flow_key key; struct sw_flow_id id; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + struct flow_stats __rcu *stats[]; /* One for each CPU. 
First one * is allocated at flow creation time, * the rest are allocated on demand * while holding the 'stats[0].lock'. diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 957a3c31dbb0..ea7a8073fa02 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; struct flow_stats *stats; - int node; - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); - flow->sf_acts = NULL; - flow->mask = NULL; - flow->id.unmasked_key = NULL; - flow->id.ufid_len = 0; - flow->stats_last_writer = NUMA_NO_NODE; + flow->stats_last_writer = -1; /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, @@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); - for_each_node(node) - if (node != 0) - RCU_INIT_POINTER(flow->stats[node], NULL); - return flow; err: kmem_cache_free(flow_cache, flow); @@ -142,17 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) static void flow_free(struct sw_flow *flow) { - int node; + int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); - /* We open code this to make sure node 0 is always considered */ - for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) - if (node != 0 && flow->stats[node]) + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) + if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, - (struct flow_stats __force *)flow->stats[node]); + (struct flow_stats __force *)flow->stats[cpu]); kmem_cache_free(flow_cache, flow); } @@ -757,7 +749,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (nr_node_ids + + (nr_cpu_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) -- cgit v1.2.3 From 695b4ec0f0a9cf29deabd3ac075911d58b31f42b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 16:20:01 -0700 Subject: pkt_sched: fq: use proper locking in fq_dump_stats() When fq is used on 32bit kernels, we need to lock the qdisc before copying 64bit fields. Otherwise "tc -s qdisc ..." might report bogus values. Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/sched/sch_fq.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index e5458b99e09c..dc52cc10d6ed 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -823,20 +823,24 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); - struct tc_fq_qd_stats st = { - .gc_flows = q->stat_gc_flows, - .highprio_packets = q->stat_internal_packets, - .tcp_retrans = q->stat_tcp_retrans, - .throttled = q->stat_throttled, - .flows_plimit = q->stat_flows_plimit, - .pkts_too_long = q->stat_pkts_too_long, - .allocation_errors = q->stat_allocation_errors, - .flows = q->flows, - .inactive_flows = q->inactive_flows, - .throttled_flows = q->throttled_flows, - .time_next_delayed_flow = q->time_next_delayed_flow - now, - }; + struct tc_fq_qd_stats st; + + sch_tree_lock(sch); + + st.gc_flows = q->stat_gc_flows; + st.highprio_packets = q->stat_internal_packets; + st.tcp_retrans = q->stat_tcp_retrans; + st.throttled = q->stat_throttled; + st.flows_plimit = q->stat_flows_plimit; + st.pkts_too_long = q->stat_pkts_too_long; + st.allocation_errors = q->stat_allocation_errors; + st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.flows = q->flows; + st.inactive_flows = q->inactive_flows; + st.throttled_flows = q->throttled_flows; + st.pad = 0; + + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } -- cgit v1.2.3 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_link.h | 9 ++++ net/core/rtnetlink.c | 111 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9bf3aecfe05b..2351776a724f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, __IFLA_STATS_MAX, }; @@ -845,6 +846,14 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + /* XDP section */ enum { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 937e459bdaa9..0dbae4244a89 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device 
*dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } -- cgit v1.2.3 From d409b84768037ad03d1d73538d99fb902adf7365 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:08 -0700 Subject: ipv6: Export ip6_route_input_lookup symbol Make ip6_route_input_lookup available outside of the ipv6 module, similar to ip_route_input_noref in the IPv4 world. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/route.c | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d97305d0e71f..e0cd318d5103 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) } void ip6_route_input(struct sk_buff *skb); +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..4dab585f7642 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void ip6_route_input(struct sk_buff *skb) { -- cgit v1.2.3 From e8bffe0cf964f0330595bb376b74921cccdaac88 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:13 -0700 Subject: net: Add _nf_(un)register_hooks symbols Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow the caller to hold the RTNL mutex. Signed-off-by: Mahesh Bandewar CC: Pablo Neira Ayuso Signed-off-by: David S.
Miller --- include/linux/netfilter.h | 2 ++ net/netfilter/core.c | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..e82b76781bf6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..2c5327e43a88 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(®->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +208,34 @@ rollback: break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(®->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +259,26 @@ err: } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, struct nf_hook_state *state, -- cgit v1.2.3 From e8bc8f9a670e26e91562e724a2114243898bd616 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Fri, 16 Sep 2016 23:05:35 +0200 Subject: sctp: Remove some redundant code In commit 311b21774f13 ("sctp: simplify sk_receive_queue locking"), a call to 'skb_queue_splice_tail_init()' has been made explicit. Previously it was hidden in 'sctp_skb_list_tail()' Now, the code around it looks redundant. The '_init()' part of 'skb_queue_splice_tail_init()' should already do the same. 
Signed-off-by: Christophe JAILLET Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 877e55066f89..84d0fdaf7de9 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) * we can go ahead and clear out the lobby in one shot */ if (!skb_queue_empty(&sp->pd_lobby)) { - struct list_head *list; skb_queue_splice_tail_init(&sp->pd_lobby, &sk->sk_receive_queue); - list = (struct list_head *)&sctp_sk(sk)->pd_lobby; - INIT_LIST_HEAD(list); return 1; } } else { -- cgit v1.2.3 From 1486587b2fcda08dee7eab23784d504eed772c45 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:30 +0200 Subject: pie: use qdisc_dequeue_head wrapper Doesn't change generated code. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index a570b0bb254c..d976d74b22d7 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct sk_buff *skb; - skb = __qdisc_dequeue_head(sch, &sch->q); + skb = qdisc_dequeue_head(sch); if (!skb) return NULL; -- cgit v1.2.3 From 97d0678f913369af0dc8b510a682a641654ab743 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:31 +0200 Subject: sched: don't use skb queue helpers A followup change will replace the sk_buff_head in the qdisc struct with a slightly different list. Use of the sk_buff_head helpers will thus cause compiler warnings. Open-code these accesses in an extra change to ease review. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/sched/sch_fifo.c | 4 ++-- net/sched/sch_generic.c | 2 +- net/sched/sch_netem.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index baeed6a78d28..1e37247656f8 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0d21b567ff27..5e63bf638350 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -486,7 +486,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff_head *list = band2list(priv, band); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index aaaf02175338..1832d7732dbc 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -502,7 +502,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<(prandom_u32() % 8); } - if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -522,7 +522,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->rate) { struct sk_buff *last; - if (!skb_queue_empty(&sch->q)) + if (sch->q.qlen) last = skb_peek_tail(&sch->q); else last = netem_rb_to_skb(rb_last(&q->t_root)); -- cgit v1.2.3 From ec323368793b8570c02e723127611a8d906a9b3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:32 +0200 Subject: sched: remove qdisc arg from __qdisc_dequeue_head Moves qdisc stat accouting to qdisc_dequeue_head. The only direct caller of the __qdisc_dequeue_head version open-codes this now. This allows us to later use __qdisc_dequeue_head as a replacement of __skb_dequeue() (which operates on sk_buff_head list). Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 15 ++++++++------- net/sched/sch_generic.c | 7 ++++++- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52a2015667b4..0741ed41575b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -614,11 +614,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, - struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) { struct sk_buff *skb = __skb_dequeue(list); + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) +{ + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); @@ -627,11 +633,6 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, return skb; } -static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) -{ - return __qdisc_dequeue_head(sch, &sch->q); -} - /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5e63bf638350..73877d9c2bcb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -506,7 +506,12 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(band >= 0)) { struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + struct sk_buff *skb = __qdisc_dequeue_head(list); + + if (likely(skb != NULL)) { + qdisc_qstats_backlog_dec(qdisc, skb); + qdisc_bstats_update(qdisc, skb); + } qdisc->q.qlen--; if (skb_queue_empty(list)) -- cgit v1.2.3 From ed760cb8aae7c2b84c193d4a7637b0c9e752f07e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:33 +0200 Subject: sched: replace __skb_dequeue with __qdisc_dequeue_head After previous patch these functions are identical. Replace __skb_dequeue in qdiscs with __qdisc_dequeue_head. Next patch will then make __qdisc_dequeue_head handle single-linked list instead of strcut sk_buff_head argument. Doesn't change generated code. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/sched/sch_codel.c | 4 ++-- net/sched/sch_netem.c | 2 +- net/sched/sch_pie.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 4002df3c7d9f..5bfa79ee657c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -69,7 +69,7 @@ struct codel_sched_data { static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { struct Qdisc *sch = ctx; - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (skb) sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1832d7732dbc..0a964b35f8c7 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -587,7 +587,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct rb_node *p; tfifo_dequeue: - skb = __skb_dequeue(&sch->q); + skb = __qdisc_dequeue_head(&sch->q); if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index d976d74b22d7..5c3a99d6aa82 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); -- cgit v1.2.3 From 48da34b7a74201f15315cb1fc40bb9a7bd2b4940 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:34 +0200 Subject: sched: add and use qdisc_skb_head helpers This change replaces sk_buff_head struct in Qdiscs with new qdisc_skb_head. Its similar to the skb_buff_head api, but does not use skb->prev pointers. Qdiscs will commonly enqueue at the tail of a list and dequeue at head. While skb_buff_head works fine for this, enqueue/dequeue needs to also adjust the prev pointer of next element. The ->prev pointer is not required for qdiscs so we can just leave it undefined and avoid one cacheline write access for en/dequeue. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 63 ++++++++++++++++++++++++++++++++++++++--------- net/sched/sch_generic.c | 21 ++++++++-------- net/sched/sch_htb.c | 24 +++++++++++++++--- net/sched/sch_netem.c | 14 +++++++++-- 4 files changed, 94 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0741ed41575b..e6aa0a249672 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,14 @@ struct qdisc_size_table { u16 data[]; }; +/* similar to sk_buff_head, but skb->prev pointer is undefined. 
*/ +struct qdisc_skb_head { + struct sk_buff *head; + struct sk_buff *tail; + __u32 qlen; + spinlock_t lock; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -76,7 +84,7 @@ struct Qdisc { * For performance sake on SMP, we put highly modified fields at the end */ struct sk_buff *gso_skb ____cacheline_aligned_in_smp; - struct sk_buff_head q; + struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; struct gnet_stats_queue qstats; @@ -600,10 +608,27 @@ static inline void qdisc_qstats_overlimit(struct Qdisc *sch) sch->qstats.overlimits++; } +static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh) +{ + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; +} + static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff_head *list) + struct qdisc_skb_head *qh) { - __skb_queue_tail(list, skb); + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; @@ -614,9 +639,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) return __qdisc_enqueue_tail(skb, sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_head(struct sk_buff_head *list) +static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = qh->head; + + if (likely(skb != NULL)) { + qh->head = skb->next; + qh->qlen--; + if (qh->head == NULL) + qh->tail = NULL; + skb->next = NULL; + } return skb; } @@ -643,10 +676,10 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, - struct sk_buff_head *list, + struct qdisc_skb_head *qh, struct sk_buff **to_free) { - struct sk_buff *skb = __skb_dequeue(list); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); @@ -667,7 +700,9 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch, static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { - return skb_peek(&sch->q); + const struct qdisc_skb_head *qh = &sch->q; + + return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ @@ -702,15 +737,19 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } -static inline void __qdisc_reset_queue(struct sk_buff_head *list) +static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ - if (!skb_queue_empty(list)) { - rtnl_kfree_skbs(list->next, list->prev); - __skb_queue_head_init(list); + ASSERT_RTNL(); + if (qh->qlen) { + rtnl_kfree_skbs(qh->head, qh->tail); + + qh->head = NULL; + qh->tail = NULL; + qh->qlen = 0; } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 73877d9c2bcb..6cfb6e9038c2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { */ struct pfifo_fast_priv { u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct qdisc_skb_head q[PFIFO_FAST_BANDS]; }; /* @@ -477,7 +477,7 @@ struct pfifo_fast_priv { */ static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, +static inline struct 
qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, int band) { return priv->q + band; @@ -489,7 +489,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *list = band2list(priv, band); priv->bitmap |= (1 << band); qdisc->q.qlen++; @@ -505,8 +505,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(list); + struct qdisc_skb_head *qh = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(qdisc, skb); @@ -514,7 +514,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } qdisc->q.qlen--; - if (skb_queue_empty(list)) + if (qh->qlen == 0) priv->bitmap &= ~(1 << band); return skb; @@ -529,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *qh = band2list(priv, band); - return skb_peek(list); + return qh->head; } return NULL; @@ -569,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + qdisc_skb_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -617,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - skb_queue_head_init(&sch->q); + qdisc_skb_head_init(&sch->q); + spin_lock_init(&sch->q.lock); spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 53dbfa187870..c798d0de8a9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -162,7 +162,7 @@ struct htb_sched { struct work_struct work; /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; + struct qdisc_skb_head direct_queue; long direct_pkts; struct qdisc_watchdog watchdog; @@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } +static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, + struct qdisc_skb_head *qh) +{ + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; +} + static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); + htb_enqueue_tail(skb, sch, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) unsigned long start_at; /* try to dequeue direct packets as high prio (!) 
to minimize cpu work */ - skb = __skb_dequeue(&q->direct_queue); + skb = __qdisc_dequeue_head(&q->direct_queue); if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); @@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - __skb_queue_head_init(&q->direct_queue); + qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0a964b35f8c7..9f7b380cf0a3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } +static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) +{ + skb->next = qh->head; + + if (!qh->head) + qh->tail = skb; + qh->head = skb; + qh->qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -523,7 +533,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *last; if (sch->q.qlen) - last = skb_peek_tail(&sch->q); + last = sch->q.tail; else last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { @@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&sch->q, skb); + netem_enqueue_skb_head(&sch->q, skb); sch->qstats.requeues++; } -- cgit v1.2.3 From b588479358ce26f32138e0f0a7ab0678f8e3e601 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Sun, 18 Sep 2016 07:42:53 +0000 Subject: xfrm: Fix memory leak of aead algorithm name commit 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") introduced aead. The function attach_aead kmemdup()s the algorithm name during xfrm_state_construct(). However this memory is never freed. Implementation has since been slightly modified in commit ee5c23176fcc ("xfrm: Clone states properly on migration") without resolving this leak. This patch adds a kfree() call for the aead algorithm name. Fixes: 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") Signed-off-by: Ilan Tayari Acked-by: Rami Rosen Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..a30f898dc1c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); -- cgit v1.2.3 From e64c97b53bc6727aa4385535166aaa047281e02d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 21 Jul 2016 14:12:41 +0200 Subject: Bluetooth: Add combined LED trigger for controller power Instead of just having a LED trigger for power on a specific controller, this adds the LED trigger "bluetooth-power" that combines the power states of all controllers into a single trigger. This simplifies the trigger selection and also supports multiple controllers per host system via a single LED. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/af_bluetooth.c | 5 +++++ net/bluetooth/leds.c | 27 +++++++++++++++++++++++++++ net/bluetooth/leds.h | 10 ++++++++++ 3 files changed, 42 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b5f729d08d2..1d96ff3a8d87 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -31,6 +31,7 @@ #include #include +#include "leds.h" #include "selftest.h" /* Bluetooth sockets */ @@ -726,6 +727,8 @@ static int __init bt_init(void) bt_debugfs = debugfs_create_dir("bluetooth", NULL); + bt_leds_init(); + err = bt_sysfs_init(); if (err < 0) return err; @@ -785,6 +788,8 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); + bt_leds_cleanup(); + debugfs_remove_recursive(bt_debugfs); } diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c index 8319c8440c89..cb670b5594eb 100644 --- a/net/bluetooth/leds.c +++ b/net/bluetooth/leds.c @@ -11,6 +11,8 @@ #include "leds.h" +DEFINE_LED_TRIGGER(bt_power_led_trigger); + struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; @@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); + + if (!enabled) { + struct hci_dev *d; + + read_lock(&hci_dev_list_lock); + + list_for_each_entry(d, &hci_dev_list, list) { + if (test_bit(HCI_UP, &d->flags)) + enabled = true; + } + + read_unlock(&hci_dev_list_lock); + } + + led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); } static void power_activate(struct led_classdev *led_cdev) @@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev) /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } + +void bt_leds_init(void) +{ + led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); +} + +void bt_leds_cleanup(void) +{ + led_trigger_unregister_simple(bt_power_led_trigger); +} diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h index a9c4d6ea01cf..08725a2fbd9b 100644 --- a/net/bluetooth/leds.h +++ b/net/bluetooth/leds.h @@ -7,10 +7,20 @@ */ #if IS_ENABLED(CONFIG_BT_LEDS) + void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); void hci_leds_init(struct hci_dev *hdev); + +void bt_leds_init(void); +void bt_leds_cleanup(void); + #else + static inline void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) {} static inline void hci_leds_init(struct hci_dev *hdev) {} + +static inline void bt_leds_init(void) {} +static inline void bt_leds_cleanup(void) {} + #endif -- cgit v1.2.3 From abbcc341adb16f68915cae7ef9a10e0d7b57e3c0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 24 Jul 2016 16:12:24 +0200 Subject: mac802154: set phy net namespace for new ifaces This patch sets the net namespace when creating SoftMAC interfaces. This is important if the namespace at phy layer was switched before. Currently we losing interfaces in some namespace and it's not possible to recover that. 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 7079cd32a7ad..06019dba4b10 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); + dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); -- cgit v1.2.3 From 5ddedce3b7331959a6da217ed3189d020090873c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 24 Jul 2016 16:12:25 +0200 Subject: 6lowpan: ndisc: no overreact if no short address is available This patch removes handling to remove short address for a neigbour entry if RS/RA/NS/NA doesn't contain a short address. If these messages doesn't has any short address option, the existing short address from ndisc cache will be used. The current behaviour will set that the neigbour doesn't has a short address anymore. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/ndisc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 86450b7e2899..941df2fa4448 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr)) neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); - } else { - neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); } write_unlock_bh(&n->lock); } -- cgit v1.2.3 From ca1de81aa262dcf48354a7c55f2558205517d06e Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 25 Jul 2016 11:46:40 -0400 Subject: mac802154: don't warn on unsupported frames Just because we don't support certain types of frames yet doesn't mean we have to flood the message log with warnings about "invalid" frames. 
Signed-off-by: Aristeu Rozanski Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 446e1300383e..b978da018bf8 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -101,6 +101,11 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, sdata->dev->stats.rx_bytes += skb->len; switch (mac_cb(skb)->type) { + case IEEE802154_FC_TYPE_BEACON: + case IEEE802154_FC_TYPE_ACK: + case IEEE802154_FC_TYPE_MAC_CMD: + goto fail; + case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: -- cgit v1.2.3 From bd89bb6daaca3e4a7c509bdacb53a610f432fa2c Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 25 Jul 2016 11:46:41 -0400 Subject: mac802154: use rate limited warnings for malformed frames Signed-off-by: Aristeu Rozanski Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b978da018bf8..4dcf6e18563a 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -109,8 +109,8 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: - pr_warn("ieee802154: bad frame received (type = %d)\n", - mac_cb(skb)->type); + pr_warn_ratelimited("ieee802154: bad frame received " + "(type = %d)\n", mac_cb(skb)->type); goto fail; } -- cgit v1.2.3 From 47b0f573f2fa7634860e16ea31f2bc3057a1022a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:37 +0200 Subject: Bluetooth: Check SOL_HCI for raw socket options The SOL_HCI level should be enforced when using socket options on the HCI raw socket interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 96f04b7b9556..99dd1503ef56 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1440,6 +1440,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -1523,6 +1526,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + if (get_user(len, optlen)) return -EFAULT; -- cgit v1.2.3 From 70ecce91e3a2d7e332fe56fd065c67d404b8fccf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:38 +0200 Subject: Bluetooth: Store control socket cookie and comm information To further allow unique identification and tracking of control socket, store cookie and comm information when binding the socket. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_sock.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bfd1590821d6..69b5174168b7 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -371,6 +371,7 @@ void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); int hci_sock_test_flag(struct sock *sk, int nr); unsigned short hci_sock_get_channel(struct sock *sk); +u32 hci_sock_get_cookie(struct sock *sk); int hci_sock_init(void); void hci_sock_cleanup(void); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 99dd1503ef56..4dce6dfdb0f2 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,8 @@ static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); +static DEFINE_IDA(sock_cookie_ida); + static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ @@ -52,6 +55,8 @@ struct hci_pinfo { __u32 cmsg_mask; unsigned short channel; unsigned long flags; + __u32 cookie; + char comm[TASK_COMM_LEN]; }; void hci_sock_set_flag(struct sock *sk, int nr) @@ -74,6 +79,11 @@ unsigned short hci_sock_get_channel(struct sock *sk) return hci_pi(sk)->channel; } +u32 hci_sock_get_cookie(struct sock *sk) +{ + return hci_pi(sk)->cookie; +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -585,6 +595,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + int id; BT_DBG("sock %p sk %p", sock, sk); @@ -593,8 +604,17 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; - if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); + break; + case HCI_CHANNEL_CONTROL: + id = hci_pi(sk)->cookie; + + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + break; + } bt_sock_unlink(&hci_sk_list, sk); @@ -957,6 +977,15 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + int id; + + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 03c979c4717c7fa0c058fafe76ac4d6acdd1fb0d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:39 +0200 Subject: Bluetooth: Introduce helper to pack mgmt version information The mgmt version information will be also needed for the control changell tracing feature. This provides a helper to pack them. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc349f633570..9f181b583b96 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1451,6 +1451,7 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ +void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7639290b6de3..9071886df194 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -278,6 +278,14 @@ static u8 le_addr_type(u8 mgmt_addr_type) return ADDR_LE_DEV_RANDOM; } +void mgmt_fill_version_info(void *ver) +{ + struct mgmt_rp_read_version *rp = ver; + + rp->version = MGMT_VERSION; + rp->revision = cpu_to_le16(MGMT_REVISION); +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -285,8 +293,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); - rp.version = MGMT_VERSION; - rp.revision = cpu_to_le16(MGMT_REVISION); + mgmt_fill_version_info(&rp); return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); -- cgit v1.2.3 From 249fa1699f8642c73eb43e61b321969f0549ab2c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:40 +0200 Subject: Bluetooth: Add support for sending MGMT open and close to monitor This sends new notifications to the monitor support whenever a management channel has been opened or closed. This allows tracing of control channels really easily. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 587d0131b349..9640790cbbcc 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -45,6 +45,8 @@ struct hci_mon_hdr { #define HCI_MON_VENDOR_DIAG 11 #define HCI_MON_SYSTEM_NOTE 12 #define HCI_MON_USER_LOGGING 13 +#define HCI_MON_CTRL_OPEN 14 +#define HCI_MON_CTRL_CLOSE 15 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4dce6dfdb0f2..2d8725006838 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -394,6 +394,59 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + u16 format = 0x0002; + u8 ver[3]; + u32 flags; + + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + if (!skb) + return NULL; + + mgmt_fill_version_info(ver); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 
0x1 : 0x0; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(format, skb_put(skb, 2)); + memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver)); + put_unaligned_le32(flags, skb_put(skb, 4)); + *skb_put(skb, 1) = TASK_COMM_LEN; + memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(4, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -468,6 +521,29 @@ static void send_monitor_replay(struct sock *sk) read_unlock(&hci_dev_list_lock); } +static void send_monitor_control_replay(struct sock *mon_sk) +{ + struct sock *sk; + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + skb = create_monitor_ctrl_open(sk); + if (!skb) + continue; + + if (sock_queue_rcv_skb(mon_sk, skb)) + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { @@ -595,6 +671,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + struct sk_buff *skb; int id; BT_DBG("sock %p sk %p", sock, sk); @@ -611,6 +688,14 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_CONTROL: id = hci_pi(sk)->cookie; + /* Send event to monitor */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_pi(sk)->cookie = 0xffffffff; ida_simple_remove(&sock_cookie_ida, id); break; @@ -931,6 +1016,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Bluetooth subsystem version %s", BT_SUBSYS_VERSION); send_monitor_replay(sk); + send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; @@ -977,6 +1063,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. 
*/ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; int id; id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); @@ -986,6 +1073,14 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.3 From 38ceaa00d02dceb22c6bdd5268f5a44d5c00e123 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:41 +0200 Subject: Bluetooth: Add support for sending MGMT commands and events to monitor This adds support for tracing all management commands and events via the monitor interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 3 ++ include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/hci_sock.c | 94 ++++++++++++++++++++++++++++++++++++++++ net/bluetooth/mgmt_util.c | 66 ++++++++++++++++++++++++++-- 4 files changed, 162 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9f181b583b96..a48f71d73dc8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1406,6 +1406,9 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb); void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk); void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb); +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk); void hci_sock_dev_event(struct hci_dev *hdev, int event); diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 9640790cbbcc..240786b04a46 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -47,6 +47,8 @@ struct hci_mon_hdr { #define HCI_MON_USER_LOGGING 13 #define HCI_MON_CTRL_OPEN 14 #define HCI_MON_CTRL_CLOSE 15 +#define HCI_MON_CTRL_COMMAND 16 +#define HCI_MON_CTRL_EVENT 17 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 2d8725006838..576ea48631b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -315,6 +315,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk) +{ + struct sock *sk; + __le16 index; + + if (hdev) + index = cpu_to_le16(hdev->id); + else + index = cpu_to_le16(MGMT_INDEX_NONE); + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) + continue; + + /* Skip the original socket */ + if (sk == skip_sk) + continue; + + skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); + if (!skb) + continue; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(event, skb_put(skb, 2)); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + skb->tstamp = tstamp; + 
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; @@ -447,6 +501,33 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return skb; } +static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, + u16 opcode, u16 len, + const void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -1257,6 +1338,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, goto done; } + if (chan->channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; + + /* Send event to monitor */ + skb = create_monitor_ctrl_command(sk, index, opcode, len, + buf + sizeof(*hdr)); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 8c30c7eb8bef..c933bd08c1fe 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -21,12 +21,41 @@ SOFTWARE IS DISCLAIMED. 
*/ +#include + #include #include +#include #include #include "mgmt_util.h" +static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, + u16 opcode, u16 len, void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { @@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, __net_timestamp(skb); hci_send_to_channel(channel, skb, flag, skip_sk); - kfree_skb(skb); + if (channel == HCI_CHANNEL_CONTROL) + hci_send_monitor_ctrl_event(hdev, event, data, data_len, + skb_get_ktime(skb), flag, skip_sk); + + kfree_skb(skb); return 0; } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; @@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) ev->status = status; ev->opcode = cpu_to_le16(cmd); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_STATUS, sizeof(*ev), ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; @@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, if (rp) memcpy(ev->data, rp, rp_len); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_COMPLETE, + sizeof(*ev) + rp_len, ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } -- cgit v1.2.3 From 37d3a1fab50fa07ac706787646e61c60e7c520e0 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 28 Aug 2016 20:53:34 +0300 Subject: Bluetooth: mgmt: Fix sending redundant event for Advertising Instance When an Advertising Instance is removed, the Advertising Removed event shouldn't be sent to the same socket that issued the Remove Advertising command (it gets a command complete event instead). The mgmt_advertising_removed() function already has a parameter for skipping a specific socket, but there was no code to propagate the right value to this parameter. This patch fixes the issue by making sure the intermediate hci_req_clear_adv_instance() function gets the socket pointer. 
Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 11 ++++++----- net/bluetooth/hci_request.h | 5 +++-- net/bluetooth/mgmt.c | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index b0e23dfc5c34..9968b1c7c03a 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1194,7 +1194,7 @@ static void adv_timeout_expire(struct work_struct *work) hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, instance, false); + hci_req_clear_adv_instance(hdev, NULL, &req, instance, false); if (list_empty(&hdev->adv_instances)) __hci_req_disable_advertising(&req); @@ -1284,8 +1284,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev) * setting. * - force == false: Only instances that have a timeout will be removed. */ -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force) +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; @@ -1311,7 +1312,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) - mgmt_advertising_removed(NULL, hdev, rem_inst); + mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); @@ -1325,7 +1326,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, err = hci_remove_adv_instance(hdev, instance); if (!err) - mgmt_advertising_removed(NULL, hdev, instance); + mgmt_advertising_removed(sk, hdev, instance); } } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index b2d044bdc732..ac1e11006f38 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -73,8 +73,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance); int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, bool force); -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force); +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9071886df194..f9af5f7c2ea2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -929,7 +929,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - hci_req_clear_adv_instance(hdev, NULL, 0x00, false); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_disable_advertising(&req); @@ -1697,7 +1697,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) enabled = lmp_host_le_capable(hdev); if (!val) - hci_req_clear_adv_instance(hdev, NULL, 0x00, true); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true); if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -6182,7 +6182,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, cp->instance, true); + hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) 
__hci_req_disable_advertising(&req); -- cgit v1.2.3 From 5504c3a31061704512707bb23bd7835e8a5281e4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:46 +0200 Subject: Bluetooth: Use individual flags for certain management events Instead of hiding everything behind a general managment events flag, introduce indivdual flags that allow fine control over which events are send to a given management channel. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 5 ++++- net/bluetooth/hci_sock.c | 5 ++++- net/bluetooth/mgmt.c | 32 +++++++++++++------------------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0aac123b5eee..ddb9accac3a5 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,7 +208,10 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, - HCI_MGMT_GENERIC_EVENTS, + HCI_MGMT_OPTION_EVENTS, + HCI_MGMT_SETTING_EVENTS, + HCI_MGMT_DEV_CLASS_EVENTS, + HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, }; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 576ea48631b9..d37c2243157b 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1164,7 +1164,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); - hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9af5f7c2ea2..469f5cc3109b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -256,13 +256,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, flag, skip_sk); } -static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, - u16 len, struct sock *skip_sk) -{ - return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, - HCI_MGMT_GENERIC_EVENTS, skip_sk); -} - static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, struct sock *skip_sk) { @@ -579,8 +572,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) @@ -1007,8 +1000,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, - sizeof(ev), skip); + return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -3000,8 +2993,8 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto failed; - err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, - data, len, sk); + err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, + len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); goto failed; } @@ -6502,8 +6495,9 @@ void __mgmt_power_off(struct 
hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), + HCI_MGMT_DEV_CLASS_EVENTS, NULL); new_settings(hdev, match.sk); @@ -7100,8 +7094,8 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - dev_class, 3, NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, + 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); if (match.sk) sock_put(match.sk); @@ -7130,8 +7124,8 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) return; } - mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); + mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 56f787c5024de7829f8cccce7569feb520829baf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:47 +0200 Subject: Bluetooth: Fix wrong Get Clock Information return parameters The address information of the Get Clock Information return parameters is copying from a different memory location. It uses &cmd->param while it actually needs to be cmd->param. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 469f5cc3109b..0c83dd36b7e3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4869,7 +4869,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) int err; memset(&rp, 0, sizeof(rp)); - memcpy(&rp.addr, &cmd->param, sizeof(rp.addr)); + memcpy(&rp.addr, cmd->param, sizeof(rp.addr)); if (status) goto complete; -- cgit v1.2.3 From 9db5c62951871c33e4443fe433e234419cf574d2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:31:57 +0200 Subject: Bluetooth: Use command status event for Set IO Capability errors In case of failure, the Set IO Capability command is suppose to return command status and not command complete. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0c83dd36b7e3..47efdb4a669a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2513,8 +2513,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, - MGMT_STATUS_INVALID_PARAMS, NULL, 0); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); -- cgit v1.2.3 From df1cb87af9f24527a8932e4d195d49ffab1168d2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:34 +0200 Subject: Bluetooth: Introduce helper functions for socket cookie handling Instead of manually allocating cookie information each time, use helper functions for generating and releasing cookies. 
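A simplified standalone model of the assign-once semantics these helpers provide (illustration only: the kernel implementation uses an IDA rather than a counter, and the boolean returned by hci_sock_gen_cookie() is what later patches use to decide whether an open notification must be sent):

  #include <stdbool.h>
  #include <stdint.h>

  static uint32_t next_cookie = 1;

  struct sock_priv { uint32_t cookie; };

  /* Returns true only on the first call for a given socket. */
  static bool gen_cookie(struct sock_priv *p)
  {
      if (p->cookie)
          return false;
      p->cookie = next_cookie++;
      return true;
  }

  static void free_cookie(struct sock_priv *p)
  {
      p->cookie = 0;
  }

  int main(void)
  {
      struct sock_priv p = { 0 };
      bool first = gen_cookie(&p);   /* true: would emit ctrl-open */
      bool again = gen_cookie(&p);   /* false: cookie already assigned */

      free_cookie(&p);               /* on release: would emit ctrl-close */
      return (first && !again) ? 0 : 1;
  }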
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d37c2243157b..804208d48368 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -84,6 +84,33 @@ u32 hci_sock_get_cookie(struct sock *sk) return hci_pi(sk)->cookie; } +static bool hci_sock_gen_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (!id) { + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + return true; + } + + return false; +} + +static void hci_sock_free_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (id) { + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + } +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -753,7 +780,6 @@ static int hci_sock_release(struct socket *sock) struct sock *sk = sock->sk; struct hci_dev *hdev; struct sk_buff *skb; - int id; BT_DBG("sock %p sk %p", sock, sk); @@ -767,8 +793,6 @@ static int hci_sock_release(struct socket *sock) atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_CONTROL: - id = hci_pi(sk)->cookie; - /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); if (skb) { @@ -777,8 +801,7 @@ static int hci_sock_release(struct socket *sock) kfree_skb(skb); } - hci_pi(sk)->cookie = 0xffffffff; - ida_simple_remove(&sock_cookie_ida, id); + hci_sock_free_cookie(sk); break; } @@ -1145,14 +1168,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { struct sk_buff *skb; - int id; - - id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); - if (id < 0) - id = 0xffffffff; - hci_pi(sk)->cookie = id; - get_task_comm(hci_pi(sk)->comm, current); + hci_sock_gen_cookie(sk); /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); -- cgit v1.2.3 From 9e8305b39bfa23a83b932007654097f4676c2ba2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:35 +0200 Subject: Bluetooth: Use numbers for subsystem version string Instead of keeping a version string around, use version and revision numbers and then stringify them for use as module parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 3 ++- net/bluetooth/af_bluetooth.c | 10 +++++++--- net/bluetooth/hci_sock.c | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69b5174168b7..d705bcf40710 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,7 +29,8 @@ #include #include -#define BT_SUBSYS_VERSION "2.21" +#define BT_SUBSYS_VERSION 2 +#define BT_SUBSYS_REVISION 21 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1d96ff3a8d87..1aff2da9bc74 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -713,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = { struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); +#define VERSION __stringify(BT_SUBSYS_VERSION) "." 
\ + __stringify(BT_SUBSYS_REVISION) + static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", BT_SUBSYS_VERSION); + BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) @@ -797,7 +801,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann "); -MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); -MODULE_VERSION(BT_SUBSYS_VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 804208d48368..a4227c777d16 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1117,8 +1117,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); - send_monitor_note(sk, "Bluetooth subsystem version %s", - BT_SUBSYS_VERSION); + send_monitor_note(sk, "Bluetooth subsystem version %u.%u", + BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); -- cgit v1.2.3 From 0ef2c42f8c4e372bad16f67dc0f4b15b9be910f6 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:36 +0200 Subject: Bluetooth: Send control open and close only when cookie is present Only when the cookie has been assigned, then send the open and close monitor messages. Also if the socket is bound to a device, then include the index into the message. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a4227c777d16..0deca758fd9e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -483,6 +483,10 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) u8 ver[3]; u32 flags; + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); if (!skb) return NULL; @@ -501,7 +505,10 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); - hdr->index = cpu_to_le16(HCI_DEV_NONE); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; @@ -512,6 +519,10 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) struct hci_mon_hdr *hdr; struct sk_buff *skb; + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; @@ -522,7 +533,10 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); - hdr->index = cpu_to_le16(HCI_DEV_NONE); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; -- cgit v1.2.3 From 5a6d2cf5f18b5afbae0b1b450070bbba50f1e3e0 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:37 +0200 Subject: Bluetooth: Assign the channel early when binding HCI sockets Assignment of the 
hci_pi(sk)->channel should be done early when binding the HCI socket. This avoids confusion with the RAW channel that is used for legacy access. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0deca758fd9e..ca13fac1c132 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1045,6 +1045,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, atomic_inc(&hdev->promisc); } + hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; break; @@ -1107,9 +1108,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } } - atomic_inc(&hdev->promisc); - + hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; + + atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: @@ -1123,6 +1125,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + hci_pi(sk)->channel = haddr.hci_channel; + /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. */ @@ -1149,6 +1153,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = -EPERM; goto done; } + + hci_pi(sk)->channel = haddr.hci_channel; break; default: @@ -1170,6 +1176,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->channel = haddr.hci_channel; + /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They @@ -1180,7 +1188,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ - if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { struct sk_buff *skb; hci_sock_gen_cookie(sk); @@ -1203,8 +1211,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, break; } - - hci_pi(sk)->channel = haddr.hci_channel; sk->sk_state = BT_BOUND; done: -- cgit v1.2.3 From d0bef1d26fb6fdad818f3d15a178d51e2a8478ae Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:38 +0200 Subject: Bluetooth: Add extra channel checks for control open/close messages The control open and close monitoring events require special channel checks to ensure messages are only send when the right events happen. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ca13fac1c132..b22efe272f7e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -479,7 +479,7 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; - u16 format = 0x0002; + u16 format; u8 ver[3]; u32 flags; @@ -487,11 +487,20 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) if (!hci_pi(sk)->cookie) return NULL; + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_CONTROL: + format = 0x0002; + mgmt_fill_version_info(ver); + break; + default: + /* No message for unsupported format */ + return NULL; + } + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); if (!skb) return NULL; - mgmt_fill_version_info(ver); flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); @@ -523,6 +532,14 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) if (!hci_pi(sk)->cookie) return NULL; + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_CONTROL: + break; + default: + /* No message for unsupported format */ + return NULL; + } + skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; @@ -652,9 +669,6 @@ static void send_monitor_control_replay(struct sock *mon_sk) sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *skb; - if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) - continue; - skb = create_monitor_ctrl_open(sk); if (!skb) continue; -- cgit v1.2.3 From f81f5b2db8692ff1d2d5f4db1fde58e67aa976a3 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:39 +0200 Subject: Bluetooth: Send control open and close messages for HCI raw sockets When opening and closing HCI raw sockets their main usage is for legacy userspace. To track interaction with the modern mgmt interface, send open and close monitoring messages for these action. The HCI raw sockets is special since it supports unbound ioctl operation and for that special case delay the notification message until at least one ioctl has been executed. The difference between a bound and unbound socket will be detailed by the fact the HCI index is present or not. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index b22efe272f7e..c7772436f508 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -488,6 +488,11 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) return NULL; switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: + format = 0x0000; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); @@ -533,6 +538,7 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return NULL; switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: case HCI_CHANNEL_CONTROL: break; default: @@ -820,6 +826,7 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); break; + case HCI_CHANNEL_RAW: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); @@ -958,6 +965,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, goto done; } + /* When calling an ioctl on an unbound raw socket, then ensure + * that the monitor gets informed. Ensure that the resulting event + * is only send once by checking if the cookie exists or not. The + * socket cookie will be only ever generated once for the lifetime + * of a given socket. + */ + if (hci_sock_gen_cookie(sk)) { + struct sk_buff *skb; + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + release_sock(sk); switch (cmd) { @@ -1061,6 +1089,26 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; + + /* Only send the event to monitor when a new cookie has + * been generated. An existing cookie means that an unbound + * socket has seen an ioctl and that triggered the cookie + * generation and sending of the monitor event. + */ + if (hci_sock_gen_cookie(sk)) { + struct sk_buff *skb; + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } break; case HCI_CHANNEL_USER: -- cgit v1.2.3 From f4cdbb3f25c15c17a952deae1f2e0db6df8f1948 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:40 +0200 Subject: Bluetooth: Handle HCI raw socket transition from unbound to bound In case an unbound HCI raw socket is later on bound, ensure that the monitor notification messages indicate a close and re-open. None of the userspace tools use the socket this, but it is actually possible to use an ioctl on an unbound socket and then later bind it. 
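A hypothetical userspace sequence that exercises this transition (assumes the BlueZ <bluetooth/bluetooth.h> and <bluetooth/hci.h> headers for the constants and structures; error handling omitted). With this and the previous patch, the monitor should see a control-open without an index once the ioctl runs on the unbound socket, then a control-close followed by a control-open carrying the index when the socket is bound:

  #include <string.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <sys/socket.h>
  #include <bluetooth/bluetooth.h>
  #include <bluetooth/hci.h>

  int main(void)
  {
      struct sockaddr_hci addr;
      struct hci_dev_info di;
      int fd;

      fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

      memset(&di, 0, sizeof(di));
      di.dev_id = 0;
      ioctl(fd, HCIGETDEVINFO, &di);           /* unbound ioctl: open event */

      memset(&addr, 0, sizeof(addr));
      addr.hci_family = AF_BLUETOOTH;
      addr.hci_dev = 0;
      addr.hci_channel = HCI_CHANNEL_RAW;
      bind(fd, (struct sockaddr *)&addr, sizeof(addr)); /* close + re-open */

      close(fd);                               /* final close event */
      return 0;
  }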
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 53 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index c7772436f508..83e9fdb712e5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1049,6 +1049,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; + struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); @@ -1088,27 +1089,34 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } hci_pi(sk)->channel = haddr.hci_channel; - hci_pi(sk)->hdev = hdev; - - /* Only send the event to monitor when a new cookie has - * been generated. An existing cookie means that an unbound - * socket has seen an ioctl and that triggered the cookie - * generation and sending of the monitor event. - */ - if (hci_sock_gen_cookie(sk)) { - struct sk_buff *skb; - - if (capable(CAP_NET_ADMIN)) - hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); - /* Send event to monitor */ - skb = create_monitor_ctrl_open(sk); + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * then there has been already an ioctl issued against + * an unbound socket and with that triggerd an open + * notification. Send a close notification first to + * allow the state transition to bounded. + */ + skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + hci_pi(sk)->hdev = hdev; + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } break; case HCI_CHANNEL_USER: @@ -1251,9 +1259,20 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { - struct sk_buff *skb; - - hci_sock_gen_cookie(sk); + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been + * assigned, this socket will transtion from + * a raw socket into a control socket. To + * allow for a clean transtion, send the + * close notification first. + */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); -- cgit v1.2.3 From 321c6feed2519a2691f65e41c4d62332d6ee3d52 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 16:46:23 +0200 Subject: Bluetooth: Add framework for Extended Controller Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command is used to retrieve the current state and basic information of a controller. It is typically used right after getting the response to the Read Controller Index List command or an Index Added event (or its extended counterparts). When any of the values in the EIR_Data field changes, the event Extended Controller Information Changed will be used to inform clients about the updated information. 
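A minimal userspace sketch issuing the new Read Extended Controller Information command (opcode 0x0042 from this patch) for controller index 0. The 6-byte management header layout (opcode, index, parameter length, all little endian) and the HCI_CHANNEL_CONTROL/HCI_DEV_NONE constants are assumed from the existing mgmt interface and BlueZ headers; response parsing and error handling are omitted.

  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/socket.h>
  #include <bluetooth/bluetooth.h>
  #include <bluetooth/hci.h>

  int main(void)
  {
      struct sockaddr_hci addr;
      uint8_t hdr[6];                 /* opcode, index, param len (__le16 each) */
      uint8_t rsp[512];
      int fd;

      fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

      memset(&addr, 0, sizeof(addr));
      addr.hci_family = AF_BLUETOOTH;
      addr.hci_dev = HCI_DEV_NONE;
      addr.hci_channel = HCI_CHANNEL_CONTROL;
      bind(fd, (struct sockaddr *)&addr, sizeof(addr));

      hdr[0] = 0x42; hdr[1] = 0x00;   /* MGMT_OP_READ_EXT_INFO */
      hdr[2] = 0x00; hdr[3] = 0x00;   /* controller index 0 */
      hdr[4] = 0x00; hdr[5] = 0x00;   /* no parameters */
      write(fd, hdr, sizeof(hdr));

      read(fd, rsp, sizeof(rsp));     /* command complete carrying the EIR data */

      close(fd);
      return 0;
  }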
Signed-off-by: Marcel Holtmann Signed-off-by: MichaÅ‚ Narajowski --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/mgmt.h | 18 +++++++++++++ net/bluetooth/mgmt.c | 62 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ddb9accac3a5..99aa5e5e3100 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,6 +208,7 @@ enum { HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, + HCI_MGMT_EXT_INFO_EVENTS, HCI_MGMT_OPTION_EVENTS, HCI_MGMT_SETTING_EVENTS, HCI_MGMT_DEV_CLASS_EVENTS, diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7647964b1efa..611b243713ea 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -586,6 +586,18 @@ struct mgmt_rp_get_adv_size_info { #define MGMT_OP_START_LIMITED_DISCOVERY 0x0041 +#define MGMT_OP_READ_EXT_INFO 0x0042 +#define MGMT_READ_EXT_INFO_SIZE 0 +struct mgmt_rp_read_ext_info { + bdaddr_t bdaddr; + __u8 version; + __le16 manufacturer; + __le32 supported_settings; + __le32 current_settings; + __le16 eir_len; + __u8 eir[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -800,3 +812,9 @@ struct mgmt_ev_advertising_added { struct mgmt_ev_advertising_removed { __u8 instance; } __packed; + +#define MGMT_EV_EXT_INFO_CHANGED 0x0025 +struct mgmt_ev_ext_info_changed { + __le16 eir_len; + __u8 eir[0]; +} __packed; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 47efdb4a669a..69001f415efa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -104,6 +104,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADVERTISING, MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_events[] = { @@ -141,6 +142,7 @@ static const u16 mgmt_events[] = { MGMT_EV_LOCAL_OOB_DATA_UPDATED, MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -149,6 +151,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_UNCONF_INDEX_LIST, MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -162,6 +165,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_NEW_CONFIG_OPTIONS, MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -862,6 +866,52 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_ext_info rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + bacpy(&rp.bdaddr, &hdev->bdaddr); + + rp.version = hdev->hci_ver; + rp.manufacturer = cpu_to_le16(hdev->manufacturer); + + rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + + rp.eir_len = cpu_to_le16(0); + + hci_dev_unlock(hdev); + + /* If this command is called at least once, then the events + * for class of device and local name changes are disabled + * and only the new extended controller information event + * is used. 
+ */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, + sizeof(rp)); +} + +static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_ext_info_changed ev; + + ev.eir_len = cpu_to_le16(0); + + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, + sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); +} + static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); @@ -2995,6 +3045,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); + ext_info_changed(hdev, sk); goto failed; } @@ -6356,6 +6407,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, + { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -6494,10 +6547,12 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); - if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, zero_cod, sizeof(zero_cod), HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } new_settings(hdev, match.sk); @@ -7093,9 +7148,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); - if (!status) + if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } if (match.sk) sock_put(match.sk); @@ -7126,6 +7183,7 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); + ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.3 From 8a0c9f49090fe8ae122fd1bbf7260c8492289386 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Thu, 1 Sep 2016 16:46:24 +0200 Subject: Bluetooth: Append local name and CoD to Extended Controller Info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds device class, complete local name and short local name to EIR data in Extended Controller Info as specified in docs. 
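A standalone illustration of the EIR field layout that eir_append_data() produces: each field is <length><type><data>, where the length byte counts the type byte plus the payload. The helper and buffer names here are local to this sketch; 0x09 is the standard Complete Local Name EIR type.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define EIR_NAME_COMPLETE 0x09

  static uint16_t eir_append(uint8_t *eir, uint16_t eir_len, uint8_t type,
                             const uint8_t *data, uint8_t data_len)
  {
      eir[eir_len++] = 1 + data_len;  /* sizeof(type) + data_len */
      eir[eir_len++] = type;
      memcpy(&eir[eir_len], data, data_len);
      return eir_len + data_len;
  }

  int main(void)
  {
      uint8_t buf[512];
      const char *name = "handset";
      uint16_t len = 0;

      len = eir_append(buf, len, EIR_NAME_COMPLETE,
                       (const uint8_t *)name, strlen(name));
      printf("eir_len=%u, first field: len=%u type=0x%02x\n",
             (unsigned)len, (unsigned)buf[0], (unsigned)buf[1]);
      return 0;
  }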
Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 69001f415efa..74179b92ef22 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -866,26 +866,58 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, + u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct mgmt_rp_read_ext_info rp; + struct mgmt_rp_read_ext_info *rp; + char buff[512]; + u16 eir_len = 0; + u8 name_len; BT_DBG("sock %p %s", sk, hdev->name); hci_dev_lock(hdev); - memset(&rp, 0, sizeof(rp)); + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + eir_len = eir_append_data(buff, eir_len, + EIR_CLASS_OF_DEV, + hdev->dev_class, 3); - bacpy(&rp.bdaddr, &hdev->bdaddr); + name_len = strlen(hdev->dev_name); + eir_len = eir_append_data(buff, eir_len, EIR_NAME_COMPLETE, + hdev->dev_name, name_len); - rp.version = hdev->hci_ver; - rp.manufacturer = cpu_to_le16(hdev->manufacturer); + name_len = strlen(hdev->short_name); + eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, + hdev->short_name, name_len); - rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); - rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + rp = kmalloc(sizeof(*rp) + eir_len, GFP_KERNEL); + if (!rp) + return -ENOMEM; + + memset(rp, 0, sizeof(*rp) + eir_len); + + rp->eir_len = cpu_to_le16(eir_len); + memcpy(rp->eir, buff, eir_len); - rp.eir_len = cpu_to_le16(0); + bacpy(&rp->bdaddr, &hdev->bdaddr); + + rp->version = hdev->hci_ver; + rp->manufacturer = cpu_to_le16(hdev->manufacturer); + + rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp->current_settings = cpu_to_le32(get_current_settings(hdev)); hci_dev_unlock(hdev); @@ -898,8 +930,8 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, - sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp, + sizeof(*rp) + eir_len); } static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) @@ -5552,17 +5584,6 @@ unlock: return err; } -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, - u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { -- cgit v1.2.3 From aa1638dde75d00e4f549902017d0df48b77e86ff Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 19:48:28 +0200 Subject: Bluetooth: Send control open and close messages for HCI user channels When opening and closing HCI user channel, send monitoring messages to be able to trace its behavior. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 83e9fdb712e5..48f9471e7c85 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -493,6 +493,11 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; + case HCI_CHANNEL_USER: + format = 0x0001; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); @@ -539,6 +544,7 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: break; default: @@ -827,6 +833,7 @@ static int hci_sock_release(struct socket *sock) atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); @@ -1179,8 +1186,36 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } hci_pi(sk)->channel = haddr.hci_channel; + + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * this socket will transition from a raw socket into + * an user channel socket. For a clean transition, send + * the close notification first. + */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + + /* The user channel is restricted to CAP_NET_ADMIN + * capabilities and with that implicitly trusted. + */ + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->hdev = hdev; + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + atomic_inc(&hdev->promisc); break; -- cgit v1.2.3 From baab793225c9badf46309f56982eb1012dbaac80 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Sep 2016 05:13:46 +0200 Subject: Bluetooth: Fix wrong New Settings event when closing HCI User Channel When closing HCI User Channel, the New Settings event was send out to inform about changed settings. However such event is wrong since the exclusive HCI User Channel access is active until the Index Added event has been sent. @ USER Close: test @ MGMT Event: New Settings (0x0006) plen 4 Current settings: 0x00000ad0 Bondable Secure Simple Pairing BR/EDR Low Energy Secure Connections = Close Index: 00:14:EF:22:04:12 @ MGMT Event: Index Added (0x0004) plen 0 Calling __mgmt_power_off from hci_dev_do_close requires an extra check for an active HCI User Channel. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ddf8432fe8fb..3ac89e9ace71 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && hdev->dev_type == HCI_PRIMARY && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); -- cgit v1.2.3 From 3c0975a7a1087add3bf873601f0270aa695d7616 Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Thu, 8 Sep 2016 12:00:11 +0200 Subject: Bluetooth: Fix reason code used for rejecting SCO connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A comment in the code states that SCO connection should be rejected with the proper error value between 0xd-0xf. The code uses HCI_ERROR_REMOTE_LOW_RESOURCES which is 0x14. This led to following error: < HCI Command: Reject Synchronous Co.. (0x01|0x002a) plen 7 Address: 34:51:C9:EF:02:CA (Apple, Inc.) Reason: Remote Device Terminated due to Low Resources (0x14) > HCI Event: Command Status (0x0f) plen 4 Reject Synchronous Connection Request (0x01|0x002a) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Instead make use of HCI_ERROR_REJ_LIMITED_RESOURCES which is 0xd. Signed-off-by: Frédéric Dalleau Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9968b1c7c03a..9566ff8e3223 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1717,7 +1717,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, * function. To be safe hard-code one of the * values that's suitable for SCO. */ - rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES; + rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(rej), &rej); -- cgit v1.2.3 From 3e36ca483a642f441b8e29b4e98091f2c62bfb38 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 10 Sep 2016 12:21:22 +0000 Subject: Bluetooth: Use kzalloc instead of kmalloc/memset Use kzalloc rather than kmalloc followed by memset with 0. Generated by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci Signed-off-by: Wei Yongjun Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 74179b92ef22..0ac881cfc646 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -902,12 +902,10 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, hdev->short_name, name_len); - rp = kmalloc(sizeof(*rp) + eir_len, GFP_KERNEL); + rp = kzalloc(sizeof(*rp) + eir_len, GFP_KERNEL); if (!rp) return -ENOMEM; - memset(rp, 0, sizeof(*rp) + eir_len); - rp->eir_len = cpu_to_le16(eir_len); memcpy(rp->eir, buff, eir_len); -- cgit v1.2.3 From 83ebb9ec734e9e768a9fae469e4a7ed1762ef43a Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Fri, 9 Sep 2016 20:24:40 +0200 Subject: Bluetooth: Fix not registering BR/EDR SMP channel with force_bredr flag If force_bredr is set SMP BR/EDR channel should also be for non-SC capable controllers. 
Since hcidev flag is persistent wrt power toggle it can be already set when calling smp_register(). This resulted in SMP BR/EDR channel not being registered even if HCI_FORCE_BREDR_SMP flag was set. This also fix NULL pointer dereference when trying to disable force_bredr after power cycle. BUG: unable to handle kernel NULL pointer dereference at 0000000000000388 IP: [] smp_del_chan+0x18/0x80 [bluetooth] Call Trace: [] force_bredr_smp_write+0xba/0x100 [bluetooth] [] full_proxy_write+0x54/0x90 [] __vfs_write+0x37/0x160 [] ? selinux_file_permission+0xd7/0x110 [] ? security_file_permission+0x3d/0xc0 [] ? percpu_down_read+0x12/0x50 [] vfs_write+0xb5/0x1a0 [] SyS_write+0x55/0xc0 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 Code: 48 8b 45 f0 eb c1 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 f6 05 c6 3b 02 00 04 55 48 89 e5 41 54 53 49 89 fc 75 4b <49> 8b 9c 24 88 03 00 00 48 85 db 74 31 49 c7 84 24 88 03 00 00 RIP [] smp_del_chan+0x18/0x80 [bluetooth] RSP CR2: 0000000000000388 Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4c1a16a96ae5..43faf2aea2ab 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev) if (!lmp_sc_capable(hdev)) { debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, hdev, &force_bredr_smp_fops); - return 0; + + /* Flag can be already set here (due to power toggle) */ + if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) + return 0; } if (WARN_ON(hdev->smp_bredr_data)) { -- cgit v1.2.3 From 7c295c4801b2de24fc25687eb0cb73cf0c99d114 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Sun, 18 Sep 2016 12:50:02 +0200 Subject: Bluetooth: Add support for local name in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables appending local name to scan response data. If currently advertised instance has name flag set it is expired immediately. 
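The shortening logic the scan-response patch below implements can be sketched in userspace as follows; this is only an assumed shape with made-up names (append_name, AD_MAX_LEN), not the kernel's append_local_name(), and the 31-byte limit and AD type values mirror the Bluetooth spec for legacy advertising.

/* Userspace sketch: append the local name, shortening it to fit. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AD_MAX_LEN        31	/* legacy advertising / scan rsp payload */
#define AD_NAME_SHORT     0x08
#define AD_NAME_COMPLETE  0x09

static size_t append_name(uint8_t *buf, size_t used, const char *name)
{
	size_t name_len = strlen(name);
	size_t space = AD_MAX_LEN - used;	/* bytes still free */
	uint8_t type = AD_NAME_COMPLETE;

	if (space < 3 || !name_len)	/* need len + type + >= 1 char */
		return used;

	if (name_len > space - 2) {	/* keep what fits, mark as shortened */
		name_len = space - 2;
		type = AD_NAME_SHORT;
	}

	buf[used++] = 1 + name_len;
	buf[used++] = type;
	memcpy(buf + used, name, name_len);
	return used + name_len;
}

int main(void)
{
	uint8_t scan_rsp[AD_MAX_LEN];
	size_t len = append_name(scan_rsp, 10,	/* pretend 10 bytes used */
				 "a-rather-long-device-name");

	printf("used=%zu type=0x%02x\n", len, (unsigned)scan_rsp[11]);
	return 0;
}

Switching the type to "shortened local name" when truncating keeps the advertisement honest about the name being cut off.
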
Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 28 +++++++++++++++++++-------- net/bluetooth/mgmt.c | 46 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9566ff8e3223..0ce6cdd278b2 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -971,14 +971,14 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) { - u8 ad_len = 0; size_t name_len; + int max_len; + max_len = HCI_MAX_AD_LENGTH - ad_len - 2; name_len = strlen(hdev->dev_name); - if (name_len > 0) { - size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; + if (name_len > 0 && max_len > 0) { if (name_len > max_len) { name_len = max_len; @@ -997,22 +997,34 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } +static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +{ + return append_local_name(hdev, ptr, 0); +} + static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, u8 *ptr) { struct adv_info *adv_instance; + u32 instance_flags; + u8 scan_rsp_len = 0; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return 0; - /* TODO: Set the appropriate entries based on advertising instance flags - * here once flags other than 0 are supported. - */ + instance_flags = adv_instance->flags; + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); - return adv_instance->scan_rsp_len; + scan_rsp_len += adv_instance->scan_rsp_len; + ptr += adv_instance->scan_rsp_len; + + if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) + scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); + + return scan_rsp_len; } void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0ac881cfc646..89954bb19222 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3012,6 +3012,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, HCI_OP_USER_PASSKEY_NEG_REPLY, 0); } +static void adv_expire(struct hci_dev *hdev, u32 flags) +{ + struct adv_info *adv_instance; + struct hci_request req; + int err; + + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + if (!adv_instance) + return; + + /* stop if current instance doesn't need to be changed */ + if (!(adv_instance->flags & flags)) + return; + + cancel_adv_timeout(hdev); + + adv_instance = hci_get_next_instance(hdev, adv_instance->instance); + if (!adv_instance) + return; + + hci_req_init(&req, hdev); + err = __hci_req_schedule_adv_instance(&req, adv_instance->instance, + true); + if (err) + return; + + hci_req_run(&req, NULL); +} + static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_cp_set_local_name *cp; @@ -3027,13 +3056,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) cp = cmd->param; - if (status) + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, mgmt_status(status)); - else + } else { mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, cp, sizeof(*cp)); + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, 
MGMT_ADV_FLAG_LOCAL_NAME); + } + mgmt_pending_remove(cmd); unlock: @@ -5885,6 +5918,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) flags |= MGMT_ADV_FLAG_TX_POWER; @@ -5961,6 +5995,10 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, tx_power_managed = true; max_len -= 3; } + } else { + /* at least 1 byte of name should fit in */ + if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) + max_len -= 3; } if (len > max_len) @@ -6293,6 +6331,10 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; + } else { + /* at least 1 byte of name should fit in */ + if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) + max_len -= 3; } return max_len; -- cgit v1.2.3 From c4960ecf2b09210930964ef2c05ce2590802ccf4 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Sun, 18 Sep 2016 12:50:03 +0200 Subject: Bluetooth: Add support for appearance in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables prepending appearance value to scan response data. It also adds support for setting appearance value through mgmt command. If currently advertised instance has apperance flag set it is expired immediately. Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 6 ++++++ net/bluetooth/hci_request.c | 8 ++++++++ net/bluetooth/mgmt.c | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a48f71d73dc8..f00bf667ec33 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -211,6 +211,7 @@ struct hci_dev { __u8 dev_name[HCI_MAX_NAME_LENGTH]; __u8 short_name[HCI_MAX_SHORT_NAME_LENGTH]; __u8 eir[HCI_MAX_EIR_LENGTH]; + __u16 appearance; __u8 dev_class[3]; __u8 major_class; __u8 minor_class; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 611b243713ea..72a456bbbcd5 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -598,6 +598,12 @@ struct mgmt_rp_read_ext_info { __u8 eir[0]; } __packed; +#define MGMT_OP_SET_APPEARANCE 0x0043 +struct mgmt_cp_set_appearance { + __u16 appearance; +} __packed; +#define MGMT_SET_APPEARANCE_SIZE 2 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 0ce6cdd278b2..c8135680c43e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1015,6 +1015,14 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + ptr[0] = 3; + ptr[1] = EIR_APPEARANCE; + put_unaligned_le16(hdev->appearance, ptr + 2); + scan_rsp_len += 4; + ptr += 4; + } + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 89954bb19222..78d708851208 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -105,6 +105,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, 
MGMT_OP_READ_EXT_INFO, + MGMT_OP_SET_APPEARANCE, }; static const u16 mgmt_events[] = { @@ -3143,6 +3144,34 @@ failed: return err; } +static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_appearance *cp = data; + u16 apperance; + int err; + + BT_DBG(""); + + apperance = le16_to_cpu(cp->appearance); + + hci_dev_lock(hdev); + + if (hdev->appearance != apperance) { + hdev->appearance = apperance; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + } + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, + 0); + + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5918,6 +5947,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) @@ -5999,6 +6029,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + max_len -= 4; } if (len > max_len) @@ -6335,6 +6368,9 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) + max_len -= 4; } return max_len; @@ -6470,6 +6506,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { set_appearance, MGMT_SET_APPEARANCE_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 5e2c59e84b633e4f7719fdc6a2930f2a311da83a Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:04 +0200 Subject: Bluetooth: Remove unused parameter from tlv_data_is_valid function hdev parameter is not used in function. 
Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 78d708851208..97f70b7fb7b1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6005,8 +6005,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return err; } -static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, - u8 len, bool is_adv_data) +static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) { u8 max_len = HCI_MAX_AD_LENGTH; int i, cur_len; @@ -6168,8 +6167,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) || - !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len, + if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) || + !tlv_data_is_valid(flags, cp->data + cp->adv_data_len, cp->scan_rsp_len, false)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); -- cgit v1.2.3 From 2bb36870e8cb29949ef9acec37129cd8e70f1857 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:05 +0200 Subject: Bluetooth: Unify advertising instance flags check This unifies max length and TLV validity checks. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 85 +++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 97f70b7fb7b1..c96b0adc4971 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6005,34 +6005,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return err; } -static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) +static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) { u8 max_len = HCI_MAX_AD_LENGTH; - int i, cur_len; - bool flags_managed = false; - bool tx_power_managed = false; if (is_adv_data) { if (adv_flags & (MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) { - flags_managed = true; + MGMT_ADV_FLAG_MANAGED_FLAGS)) max_len -= 3; - } - if (adv_flags & MGMT_ADV_FLAG_TX_POWER) { - tx_power_managed = true; + if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; - } } else { /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; - if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) max_len -= 4; } + return max_len; +} + +static bool flags_managed(u32 adv_flags) +{ + return adv_flags & (MGMT_ADV_FLAG_DISCOV | + MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS); +} + +static bool tx_power_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_TX_POWER; +} + +static bool name_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME; +} + +static bool appearance_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_APPEARANCE; +} + +static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) +{ + int i, cur_len; + u8 max_len; + + max_len = tlv_data_max_len(adv_flags, is_adv_data); + if (len > max_len) return false; @@ -6040,10 +6065,20 @@ static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; - if (flags_managed && 
data[i + 1] == EIR_FLAGS) + if (data[i + 1] == EIR_FLAGS && flags_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags)) return false; - if (tx_power_managed && data[i + 1] == EIR_TX_POWER) + if (data[i + 1] == EIR_APPEARANCE && + appearance_managed(adv_flags)) return false; /* If the current field length would exceed the total data @@ -6351,30 +6386,6 @@ unlock: return err; } -static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) -{ - u8 max_len = HCI_MAX_AD_LENGTH; - - if (is_adv_data) { - if (adv_flags & (MGMT_ADV_FLAG_DISCOV | - MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) - max_len -= 3; - - if (adv_flags & MGMT_ADV_FLAG_TX_POWER) - max_len -= 3; - } else { - /* at least 1 byte of name should fit in */ - if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) - max_len -= 3; - - if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) - max_len -= 4; - } - - return max_len; -} - static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { -- cgit v1.2.3 From 9c9db78dc0fbbd95177fefdad008e46ffaa777f2 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:06 +0200 Subject: Bluetooth: Fix advertising instance validity check for flags Flags are not allowed in Scan Response. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c96b0adc4971..2758c6a4425c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6065,7 +6065,8 @@ static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; - if (data[i + 1] == EIR_FLAGS && flags_managed(adv_flags)) + if (data[i + 1] == EIR_FLAGS && + (!is_adv_data || flags_managed(adv_flags))) return false; if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) -- cgit v1.2.3 From 3310230c5dddfafe3d1ef87f1257812011681aca Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:07 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added Read Extended Controller Information and Set Appearance commands. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2758c6a4425c..54dd218d06f7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 13 +#define MGMT_REVISION 14 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3 From 7d5c11da1ff6389511c42448f59456373edfc103 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Mon, 19 Sep 2016 20:25:52 +0200 Subject: Bluetooth: Refactor read_ext_controller_info handler There is no need to allocate heap for reply only to copy stack data to it. This also fix rp memory leak and missing hdev unlock if kmalloc failed. 
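The bug class this refactor removes can be shown with a small userspace sketch, under assumed names (build_reply, dev_lock) rather than the mgmt code: if a reply is heap-allocated inside the locked section and an allocation failure returns early, the unlock is skipped and memory can leak, whereas assembling the reply in a stack buffer leaves only one path through the lock.

/* build: cc -pthread sketch.c -- illustrative only, not the mgmt handler */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static char dev_name[64] = "example-controller";

static int build_reply(uint8_t *out, size_t out_len)
{
	char buf[128];		/* reply assembled on the stack */
	size_t len;

	pthread_mutex_lock(&dev_lock);
	len = strlen(dev_name);
	memcpy(buf, dev_name, len);
	pthread_mutex_unlock(&dev_lock);	/* unlocked on the only path */

	if (len > out_len)
		len = out_len;
	memcpy(out, buf, len);
	return (int)len;
}

int main(void)
{
	uint8_t reply[64];
	int n = build_reply(reply, sizeof(reply));

	printf("reply of %d bytes: %.*s\n", n, n, (char *)reply);
	return 0;
}
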
Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 54dd218d06f7..604c48142848 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -881,42 +881,38 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct mgmt_rp_read_ext_info *rp; - char buff[512]; + char buf[512]; + struct mgmt_rp_read_ext_info *rp = (void *)buf; u16 eir_len = 0; - u8 name_len; + size_t name_len; BT_DBG("sock %p %s", sk, hdev->name); + memset(&buf, 0, sizeof(buf)); + hci_dev_lock(hdev); + bacpy(&rp->bdaddr, &hdev->bdaddr); + + rp->version = hdev->hci_ver; + rp->manufacturer = cpu_to_le16(hdev->manufacturer); + + rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp->current_settings = cpu_to_le32(get_current_settings(hdev)); + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - eir_len = eir_append_data(buff, eir_len, - EIR_CLASS_OF_DEV, + eir_len = eir_append_data(rp->eir, eir_len, EIR_CLASS_OF_DEV, hdev->dev_class, 3); name_len = strlen(hdev->dev_name); - eir_len = eir_append_data(buff, eir_len, EIR_NAME_COMPLETE, + eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_COMPLETE, hdev->dev_name, name_len); name_len = strlen(hdev->short_name); - eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, + eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_SHORT, hdev->short_name, name_len); - rp = kzalloc(sizeof(*rp) + eir_len, GFP_KERNEL); - if (!rp) - return -ENOMEM; - rp->eir_len = cpu_to_le16(eir_len); - memcpy(rp->eir, buff, eir_len); - - bacpy(&rp->bdaddr, &hdev->bdaddr); - - rp->version = hdev->hci_ver; - rp->manufacturer = cpu_to_le16(hdev->manufacturer); - - rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); - rp->current_settings = cpu_to_le32(get_current_settings(hdev)); hci_dev_unlock(hdev); -- cgit v1.2.3 From cde7a863d36a4a629c111f37edc2297d6b822a82 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Mon, 19 Sep 2016 20:25:53 +0200 Subject: Bluetooth: Factor appending EIR to separate helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will also be used for Extended Information Event handling. 
Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 604c48142848..2b6fe10256b9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -878,13 +878,32 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, return eir_len; } +static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) +{ + u16 eir_len = 0; + size_t name_len; + + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV, + hdev->dev_class, 3); + + name_len = strlen(hdev->dev_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, + hdev->dev_name, name_len); + + name_len = strlen(hdev->short_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT, + hdev->short_name, name_len); + + return eir_len; +} + static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { char buf[512]; struct mgmt_rp_read_ext_info *rp = (void *)buf; - u16 eir_len = 0; - size_t name_len; + u16 eir_len; BT_DBG("sock %p %s", sk, hdev->name); @@ -900,18 +919,8 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); rp->current_settings = cpu_to_le32(get_current_settings(hdev)); - if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - eir_len = eir_append_data(rp->eir, eir_len, EIR_CLASS_OF_DEV, - hdev->dev_class, 3); - - name_len = strlen(hdev->dev_name); - eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_COMPLETE, - hdev->dev_name, name_len); - - name_len = strlen(hdev->short_name); - eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_SHORT, - hdev->short_name, name_len); + eir_len = append_eir_data_to_buf(hdev, rp->eir); rp->eir_len = cpu_to_le16(eir_len); hci_dev_unlock(hdev); -- cgit v1.2.3 From 6a9e90bff9cfb33d5939c29e5bf2674c9176365d Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Mon, 19 Sep 2016 20:25:54 +0200 Subject: Bluetooth: Add appearance to Read Ext Controller Info command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If LE is enabled appearance is added to EIR data. 
Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2b6fe10256b9..d3837e0633af 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -878,6 +878,16 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, return eir_len; } +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} + static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) { u16 eir_len = 0; @@ -887,6 +897,10 @@ static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV, hdev->dev_class, 3); + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE, + hdev->appearance); + name_len = strlen(hdev->dev_name); eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, hdev->dev_name, name_len); -- cgit v1.2.3 From 5e9fae48f800b973e45887ce0b8d717d54c0bb11 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Mon, 19 Sep 2016 20:25:55 +0200 Subject: Bluetooth: Add supported data types to ext info changed event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds EIR data to extended info changed event. Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d3837e0633af..29e5ce95c50c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -954,12 +954,18 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) { - struct mgmt_ev_ext_info_changed ev; + char buf[512]; + struct mgmt_ev_ext_info_changed *ev = (void *)buf; + u16 eir_len; - ev.eir_len = cpu_to_le16(0); + memset(buf, 0, sizeof(buf)); + + eir_len = append_eir_data_to_buf(hdev, ev->eir); + ev->eir_len = cpu_to_le16(eir_len); - return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, - sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev, + sizeof(*ev) + eir_len, + HCI_MGMT_EXT_INFO_EVENTS, skip); } static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) -- cgit v1.2.3 From e74317f43f5ce2d13cddaab867c59d42934d9585 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Mon, 19 Sep 2016 20:25:56 +0200 Subject: Bluetooth: Fix missing ext info event when setting appearance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds missing event when setting appearance, just like in the set local name command. 
Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 29e5ce95c50c..cd9f345894e0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3187,6 +3187,8 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, if (hci_dev_test_flag(hdev, HCI_LE_ADV)) adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + + ext_info_changed(hdev, sk); } err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, -- cgit v1.2.3 From af4168c5a925dc3b11b0246c2b91124327919f47 Mon Sep 17 00:00:00 2001 From: MichaÅ‚ Narajowski Date: Mon, 19 Sep 2016 14:33:33 +0200 Subject: Bluetooth: Set appearance only for LE capable controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting appearance on controllers without LE support will result in No Supported error. Signed-off-by: MichaÅ‚ Narajowski Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cd9f345894e0..7b2bac492fb1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3178,6 +3178,10 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); + if (!lmp_le_capable(hdev)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, + MGMT_STATUS_NOT_SUPPORTED); + apperance = le16_to_cpu(cp->appearance); hci_dev_lock(hdev); -- cgit v1.2.3 From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. 
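The underlying rule is generic: a struct copy duplicates a pointer to a reference-counted object, so the copy must take its own reference or the count underflows once both holders drop theirs. A minimal userspace sketch with purely illustrative names (group, sock_like, clone_sock), not the kernel's cgroup API:

/* Illustrative sketch of the bug class, not kernel code. */
#include <stdio.h>

struct group {
	int refcnt;
};

struct sock_like {
	struct group *grp;
};

static void group_get(struct group *g) { g->refcnt++; }
static void group_put(struct group *g) { g->refcnt--; }

static struct sock_like clone_sock(const struct sock_like *orig)
{
	struct sock_like copy = *orig;	/* duplicates the pointer... */

	group_get(copy.grp);		/* ...so it must also take a ref */
	return copy;
}

int main(void)
{
	struct group g = { .refcnt = 1 };
	struct sock_like a = { .grp = &g };
	struct sock_like b = clone_sock(&a);

	group_put(a.grp);
	group_put(b.grp);
	printf("refcnt=%d (would be -1 without the get in clone)\n", g.refcnt);
	return 0;
}
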
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.2.3 From 07b26c9454a2a19fff86d6fcf2aba6bc801eb8d8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 19 Sep 2016 12:58:47 +0200 Subject: gso: Support partial splitting at the frag_list pointer Since commit 8a29111c7 ("net: gro: allow to build full sized skb") gro may build buffers with a frag_list. This can hurt forwarding because most NICs can't offload such packets, they need to be segmented in software. This patch splits buffers with a frag_list at the frag_list pointer into buffers that can be TSO offloaded. Signed-off-by: Steffen Klassert Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 51 +++++++++++++++++++++++++++++++++++++++----------- net/ipv4/af_inet.c | 14 ++++++++++---- net/ipv4/gre_offload.c | 6 ++++-- net/ipv4/tcp_offload.c | 13 +++++++------ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/ip6_offload.c | 5 ++++- 6 files changed, 69 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e329d411242..7bf82a28e10a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3097,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. 
+ * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3109,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3300,21 +3321,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e94b47be0019..1effc986739e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; @@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (IS_ERR_OR_NULL(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); @@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } - tot_len = skb_shinfo(skb)->gso_size + - SKB_GSO_CB(skb)->data_offset + - skb->head - (unsigned char *)iph; + + if (gso_partial) + tot_len = skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)iph; + else + tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ecd1e09dbbf1..96e0efecefa6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo; + bool need_csum, ufo, gso_partial; if (!skb->encapsulation) goto out; @@ -69,6 +69,8 @@ static struct sk_buff 
*gre_gso_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; @@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (skb_is_gso(skb)) { + if (gso_partial) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5c5964962d0c..bc68da38ea86 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, goto out; } - /* GSO partial only requires splitting the frame into an MSS - * multiple and possibly a remainder. So update the mss now. - */ - if (features & NETIF_F_GSO_PARTIAL) - mss = skb->len - (skb->len % mss); - copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ @@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; + /* GSO partial and frag_list segmentation only requires splitting + * the frame into an MSS multiple and possibly a remainder, both + * cases return a GSO skb. So update the mss now. + */ + if (skb_is_gso(segs)) + mss *= skb_shinfo(segs)->gso_segs; + delta = htonl(oldlen + (thlen + mss)); skb = segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 81f253b6ff36..f9333c963607 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo; + bool remcsum, need_csum, offload_csum, ufo, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. 
*/ - if (skb_is_gso(skb)) { + if (gso_partial) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 22e90e56b5a9..e7bfd55899a3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int offset = 0; bool encap, udpfrag; int nhoff; + bool gso_partial; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); @@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (IS_ERR(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) + if (gso_partial) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); -- cgit v1.2.3 From 6a5d58b67e205f2ffc62d0a9ee4ef7d237e9a7fb Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:42 -0400 Subject: net sched ife action: add 16 bit helpers encoder and checker for 16 bits metadata Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 2 ++ net/sched/act_ife.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'net') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 5164bd7a38fb..9fd2bea0a6e0 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -50,9 +50,11 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi); int ife_validate_meta_u32(void *val, int len); int ife_validate_meta_u16(void *val, int len); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi); void ife_release_meta_gen(struct tcf_meta_info *mi); int register_ife_op(struct tcf_meta_ops *mops); int unregister_ife_op(struct tcf_meta_ops *mops); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e87cd81315e1..ccf7b4b655fe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) } EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u16 edata = 0; + + if (mi->metaval) + edata = *(u16 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htons(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u16); + int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) @@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_check_meta_u32); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u16); + int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; -- cgit 
v1.2.3 From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:43 -0400 Subject: net sched ife action: Introduce skb tcindex metadata encap decap Sample use case of how this is encoded: user space via tuntap (or a connected VM/Machine/container) encodes the tcindex TLV. Sample use case of decoding: IFE action decodes it and the skb->tc_index is then used to classify. So something like this for encoded ICMP packets: .. first decode then reclassify... skb->tcindex will be set sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify ...next match the decode icmp packet... sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue ... last classify it using the tcindex classifier and do someaction.. sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \ handle 0x11 tcindex classid 1:1 \ action blah.. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_ife.h | 3 +- net/sched/Kconfig | 5 +++ net/sched/Makefile | 1 + net/sched/act_meta_skbtcindex.c | 79 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 net/sched/act_meta_skbtcindex.c (limited to 'net') diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 4ece02a77b9a..cd18360eca24 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -32,8 +32,9 @@ enum { #define IFE_META_HASHID 2 #define IFE_META_PRIO 3 #define IFE_META_QMAP 4 +#define IFE_META_TCINDEX 5 /*Can be overridden at runtime by module option*/ -#define __IFE_META_MAX 5 +#define __IFE_META_MAX 6 #define IFE_META_MAX (__IFE_META_MAX - 1) #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7795d5a3f79a..87956a768d1b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO depends on NET_ACT_IFE ---help--- +config NET_IFE_SKBTCINDEX + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 148ae0d5ac2c..4bdda3634e0b 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c new file mode 100644 index 000000000000..3b35774ce890 --- /dev/null +++ b/net/sched/act_meta_skbtcindex.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * copyright Jamal Hadi Salim (2016) + * +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifetc_index = skb->tc_index; + + return ife_encode_meta_u16(ifetc_index, skbdata, e); +} + +static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len) +{ + u16 ifetc_index = *(u16 *)data; + + skb->tc_index = ntohs(ifetc_index); + return 0; +} + +static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u16(skb->tc_index, e); +} + +static struct tcf_meta_ops ife_skbtcindex_ops = { + .metaid = IFE_META_TCINDEX, + .metatype = NLA_U16, + .name = "tc_index", + .synopsis = "skb tc_index 16 bit metadata", + .check_presence = skbtcindex_check, + .encode = skbtcindex_encode, + .decode = skbtcindex_decode, + .get = ife_get_meta_u16, + .alloc = ife_alloc_meta_u16, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u16, + .owner = THIS_MODULE, +}; + +static int __init ifetc_index_init_module(void) +{ + return register_ife_op(&ife_skbtcindex_ops); +} + +static void __exit ifetc_index_cleanup_module(void) +{ + unregister_ife_op(&ife_skbtcindex_ops); +} + +module_init(ifetc_index_init_module); +module_exit(ifetc_index_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2016)"); +MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); -- cgit v1.2.3 From f71b109f1730902b73f70d78764d8a41265080dd Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Sun, 18 Sep 2016 07:53:08 -0400 Subject: net sched actions police: peg drop stats for conforming traffic setting conforming action to drop is a valid policy. When it is set we need to at least see the stats indicating it for debugging. Signed-off-by: Roman Mashak Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_police.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8a3be1d99775..ba7074b391ae 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + if (police->tcfp_result == TC_ACT_SHOT) + police->tcf_qstats.drops++; spin_unlock(&police->tcf_lock); return police->tcfp_result; } -- cgit v1.2.3 From 5a7a5555a362f60350668cd124df9a396f546c61 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 08:45:33 -0400 Subject: net sched: stylistic cleanups Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 16 ++++++---------- net/sched/act_csum.c | 36 ++++++++++++++++++------------------ net/sched/act_gact.c | 3 ++- net/sched/act_mirred.c | 3 ++- net/sched/act_police.c | 10 ++++------ net/sched/cls_api.c | 18 ++++++++++-------- net/sched/cls_bpf.c | 6 ++++-- net/sched/cls_flow.c | 21 ++++++++++++++------- net/sched/cls_flower.c | 3 ++- net/sched/cls_fw.c | 10 +++++----- net/sched/cls_route.c | 9 +++------ net/sched/cls_tcindex.c | 12 ++++++------ net/sched/cls_u32.c | 30 ++++++++++++------------------ net/sched/sch_api.c | 41 ++++++++++++++++++++++++++--------------- 14 files changed, 114 insertions(+), 104 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d09d0687594b..d0aceb1740b1 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -592,9 +592,8 @@ err_out: return ERR_PTR(err); } -int tcf_action_init(struct net *net, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, - int bind, struct list_head *actions) +int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -923,9 +922,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return err; } -static int -tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 portid, int ovr) +static int tcf_action_add(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid, int ovr) { int ret = 0; LIST_HEAD(actions); @@ -988,8 +986,7 @@ replay: return ret; } -static struct nlattr * -find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(const struct nlmsghdr *n) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1016,8 +1013,7 @@ find_dump_kind(const struct nlmsghdr *n) return kind; } -static int -tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b5dbf633a863..e0defcef376d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, return (void *)(skb_network_header(skb) + ihl); } -static int tcf_csum_ipv4_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmphdr *icmph; @@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmp6hdr *icmp6h; const struct ipv6hdr *ip6h; @@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct iphdr *iph; @@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct ipv6hdr 
*ip6h; @@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct iphdr *iph; @@ -270,8 +270,8 @@ ignore_obscure_skb: return 1; } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct ipv6hdr *ip6h; @@ -380,8 +380,8 @@ fail: return 0; } -static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, - unsigned int ixhl, unsigned int *pl) +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl, + unsigned int *pl) { int off, len, optlen; unsigned char *xh = (void *)ip6xh; @@ -494,8 +494,8 @@ fail: return 0; } -static int tcf_csum(struct sk_buff *skb, - const struct tc_action *a, struct tcf_result *res) +static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); int action; @@ -531,8 +531,8 @@ drop: return TC_ACT_SHOT; } -static int tcf_csum_dump(struct sk_buff *skb, - struct tc_action *a, int bind, int ref) +static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_csum *p = to_tcf_csum(a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e24a4093d6f6..e0aa30f83c6c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets); + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, + packets); if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6038c85d92f5..1c76387c5d9c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -204,7 +204,8 @@ out: return retval; } -static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ba7074b391ae..d1bd248fe146 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -263,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcf_action; } -static int -tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); @@ -349,14 +349,12 @@ static struct pernet_operations police_net_ops = { .size = sizeof(struct tc_action_net), }; -static int __init -police_init_module(void) +static int __init police_init_module(void) { return tcf_register_action(&act_police_ops, &police_net_ops); } -static void __exit -police_cleanup_module(void) +static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops, &police_net_ops); } diff 
--git a/net/sched/cls_api.c b/net/sched/cls_api.c index a7c5645373af..11da7da0b7c4 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -344,13 +344,15 @@ replay: if (err == 0) { struct tcf_proto *next = rtnl_dereference(tp->next); - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, + RTM_DELTFILTER); if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, + RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -448,7 +450,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct net *net = sock_net(a->skb->sk); return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER); } /* called with RTNL */ @@ -552,7 +555,7 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { #ifdef CONFIG_NET_CLS_ACT { @@ -560,8 +563,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tb[exts->police], rate_tlv, - "police", ovr, - TCA_ACT_BIND); + "police", ovr, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); @@ -573,8 +575,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, int err, i = 0; err = tcf_action_init(net, tb[exts->action], rate_tlv, - NULL, ovr, - TCA_ACT_BIND, &actions); + NULL, ovr, TCA_ACT_BIND, + &actions); if (err) return err; list_for_each_entry(act, &actions, list) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1d92d4d3f222..c6f7a47541eb 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -55,7 +55,8 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, - [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -409,7 +410,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; } - ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], + ovr); if (ret < 0) goto errout; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a379bae1d74e..e39672394c7b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto(const struct sk_buff *skb, + const struct flow_keys *flow) { return flow->basic.ip_proto; } -static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.src); 
@@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys return addr_fold(skb->sk); } -static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.dst); @@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb) }) #endif -static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_src(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -161,7 +165,8 @@ fallback: return flow_get_src(skb, flow); } -static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -173,14 +178,16 @@ fallback: return flow_get_dst(skb, flow); } -static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb, flow); } -static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a3f4c706dfaa..2af09c872a1a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -241,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc.type = TC_SETUP_CLSFLOWER; tc.cls_flower = &offload; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &tc); if (tc_skip_sw(flags)) return err; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index cc0bda945800..9dc63d54e167 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle) } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) + struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; @@ -188,7 +188,8 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, - struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) + struct nlattr **tb, struct nlattr **tca, unsigned long base, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct tcf_exts e; @@ -237,9 +238,8 @@ errout: static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + u32 handle, struct nlattr **tca, unsigned long *arg, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *) *arg; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index c91e65d81a48..a4ce39b19be0 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static void -route4_delete_filter(struct rcu_head *head) +static void route4_delete_filter(struct rcu_head *head) { struct 
route4_filter *f = container_of(head, struct route4_filter, rcu); @@ -474,10 +473,8 @@ errout: } static int route4_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index d9500709831f..96144bdf30db 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -50,14 +50,13 @@ struct tcindex_data { struct rcu_head rcu; }; -static inline int -tcindex_filter_is_set(struct tcindex_filter_result *r) +static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { return tcf_exts_is_predicative(&r->exts) || r->res.classid; } -static struct tcindex_filter_result * -tcindex_lookup(struct tcindex_data *p, u16 key) +static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, + u16 key) { if (p->perfect) { struct tcindex_filter_result *f = p->perfect + key; @@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head) static void tcindex_destroy_fexts(struct rcu_head *head) { - struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + struct tcindex_filter *f = container_of(head, struct tcindex_filter, + rcu); tcf_exts_destroy(&f->result.exts); kfree(f); @@ -550,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index a29263a9d8c1..ae83c3aec308 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key, return h; } -static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { struct { struct tc_u_knode *knode; @@ -256,8 +257,7 @@ deadloop: return -1; } -static struct tc_u_hnode * -u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static struct tc_u_knode * -u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { unsigned int sel; struct tc_u_knode *n = NULL; @@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp) return 0; } -static int u32_destroy_key(struct tcf_proto *tp, - struct tc_u_knode *n, +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { tcf_exts_destroy(&n->exts); @@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) } } -static int u32_replace_hw_hnode(struct tcf_proto *tp, - struct tc_u_hnode *h, - u32 flags) +static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto 
*tp, struct tc_u_hnode *h) } } -static int u32_replace_hw_knode(struct tcf_proto *tp, - struct tc_u_knode *n, - u32 flags) +static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -763,8 +759,7 @@ errout: return err; } -static void u32_replace_knode(struct tcf_proto *tp, - struct tc_u_common *tp_c, +static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, struct tc_u_knode *n) { struct tc_u_knode __rcu **ins; @@ -845,8 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct nlattr **tca, unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -1088,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; struct tc_u_hnode *ht_up, *ht_down; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d677b3484d81..206dc24add3a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -389,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, + struct nlattr *tab) { struct qdisc_rate_table *rtab; @@ -541,7 +542,8 @@ nla_put_failure: return -1; } -void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct qdisc_size_table *stab) { int pkt_len, slot; @@ -888,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock; Parameters are passed via opt. 
*/ -static struct Qdisc * -qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, - struct Qdisc *p, u32 parent, u32 handle, - struct nlattr **tca, int *errp) +static struct Qdisc *qdisc_create(struct net_device *dev, + struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) { int err; struct nlattr *kind = tca[TCA_KIND]; @@ -1073,7 +1075,8 @@ struct check_loop_arg { int depth; }; -static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); +static int check_loop_fn(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w); static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) { @@ -1450,7 +1453,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, } else { if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } @@ -1471,7 +1475,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, } if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } @@ -1505,7 +1510,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0) + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + true) < 0) goto done; dev_queue = dev_ingress_queue(dev); @@ -1640,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) if (cops->delete) err = cops->delete(q, cl); if (err == 0) - tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, + RTM_DELTCLASS); goto out; case RTM_GETTCLASS: err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); @@ -1738,12 +1745,14 @@ struct qdisc_dump_args { struct netlink_callback *cb; }; -static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTCLASS); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, @@ -1976,10 +1985,12 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, + NULL); rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, + NULL); return 0; } -- cgit v1.2.3 From a435a07f9164dda7c0c26e8ad758881f4bafc127 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sun, 18 Sep 2016 17:46:07 +0200 Subject: net: ipv6: fallback to full lookup if table lookup is unsuitable Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a 
regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: Vincent Bernat Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49817555449e..e3a224b97905 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1986,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); -- cgit v1.2.3 From 83e7e4ce9e93c3b020497144f4354b62aed5d894 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2016 19:00:10 +0800 Subject: mac80211: Use rhltable instead of rhashtable mac80211 currently uses rhashtable with insecure_elasticity set to true. The latter is because of duplicate objects. What's more, mac80211 walks the rhashtable chains by hand which is broken as rhashtable may contain multiple tables due to resizing or rehashing. This patch fixes it by converting it to the newly added rhltable interface which is designed for use with duplicate objects. With rhltable a lookup returns a list of objects instead of a single one. This is then fed into the existing for_each_sta_info macro. This patch also deletes the sta_addr_hash function since rhashtable defaults to jhash. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/rx.c | 7 ++----- net/mac80211/sta_info.c | 52 +++++++++++++++++++--------------------------- net/mac80211/sta_info.h | 19 +++++++---------- net/mac80211/status.c | 7 ++----- 5 files changed, 33 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c71c73594790..e496dee5af08 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1213,7 +1213,7 @@ struct ieee80211_local { spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; - struct rhashtable sta_hash; + struct rhltable sta_hash; struct timer_list sta_cleanup; int sta_generation; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e796060b7c5e..f7cf342bab52 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4003,7 +4003,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, __le16 fc; struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; - struct rhash_head *tmp; + struct rhlist_head *tmp; int err = 0; fc = ((struct ieee80211_hdr *)skb->data)->frame_control; @@ -4046,13 +4046,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, goto out; } else if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; - const struct bucket_table *tbl; prev_sta = NULL; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) { + for_each_sta_info(local, hdr->addr2, sta, tmp) { if (!prev_sta) { prev_sta = sta; continue; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1b1b28ff4fdb..c803e2cb58bc 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -67,12 +67,10 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ - .insecure_elasticity = true, /* Disable chain-length checks. 
*/ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, - .hashfn = sta_addr_hash, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; @@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = { static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_remove(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void __cleanup_single_sta(struct sta_info *sta) @@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta) sta_info_free(local, sta); } +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr) +{ + return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); +} + /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold @@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); @@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_insert(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) @@ -453,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta) is_multicast_ether_addr(sta->sta.addr))) return -EINVAL; - /* Strictly speaking this isn't necessary as we hold the mutex, but - * the rhashtable code can't really deal with that distinction. We - * do require the mutex for correctness though. + /* The RCU read lock is required by rhashtable due to + * asynchronous resize/rehash. We also require the mutex + * for correctness. 
*/ rcu_read_lock(); lockdep_assert_held(&sdata->local->sta_mtx); @@ -1043,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data) round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } -u32 sta_addr_hash(const void *key, u32 length, u32 seed) -{ - return jhash(key, ETH_ALEN, seed); -} - int sta_info_init(struct ieee80211_local *local) { int err; - err = rhashtable_init(&local->sta_hash, &sta_rht_params); + err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; @@ -1068,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); - rhashtable_destroy(&local->sta_hash); + rhltable_destroy(&local->sta_hash); } @@ -1138,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; - - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); /* * Just return a random station if localaddr is NULL * ... first in list. */ - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 530231b73278..ed5fcb984a01 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -455,7 +455,7 @@ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; - struct rhash_head hash_node; + struct rhlist_head hash_node; u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -638,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid) */ #define STA_INFO_CLEANUP_INTERVAL (10 * HZ) +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr); + /* * Get a STA info, must be under RCU read lock. */ @@ -647,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); -u32 sta_addr_hash(const void *key, u32 length, u32 seed); - -#define _sta_bucket_idx(_tbl, _a) \ - rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd)) - -#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \ - rht_for_each_entry_rcu(_sta, _tmp, tbl, \ - _sta_bucket_idx(tbl, _addr), \ - hash_node) \ - /* compare address and run code only if it matches */ \ - if (ether_addr_equal(_sta->addr, (_addr))) +#define for_each_sta_info(local, _addr, _sta, _tmp) \ + rhl_for_each_entry_rcu(_sta, _tmp, \ + sta_info_hash_lookup(local, _addr), hash_node) /* * Get STA info by index, BROKEN! 
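A rough illustration of the new iteration model (not part of this patch): rhltable_lookup() returns the whole chain of entries that share a key, and the reworked for_each_sta_info() above simply walks that chain with rhl_for_each_entry_rcu(). A minimal, hypothetical helper built on it might look like the sketch below; the function name and its purpose are invented for illustration and it assumes mac80211's internal headers, while the macro, sta_info_hash_lookup() and the RCU requirement come from the patch itself:

/* hypothetical: count every sta_info registered for a given MAC address,
 * visiting each duplicate (one entry per sdata) exactly once
 */
static unsigned int sta_info_count_addr(struct ieee80211_local *local,
					const u8 *addr)
{
	struct rhlist_head *tmp;
	struct sta_info *sta;
	unsigned int n = 0;

	rcu_read_lock();
	for_each_sta_info(local, addr, sta, tmp)
		n++;
	rcu_read_unlock();

	return n;
}

Because duplicates are expected (the same MAC address can have one entry per sdata), callers filter inside the loop body, exactly as sta_info_get() does above with its sta->sdata == sdata check.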
diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ea39f8a7baf3..ddf71c648cab 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -746,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); __le16 fc; struct ieee80211_supported_band *sband; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; int retry_count; int rates_idx; bool send_to_cooked; @@ -755,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; - const struct bucket_table *tbl; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -764,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) { + for_each_sta_info(local, hdr->addr1, sta, tmp) { /* skip wrong virtual interface */ if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) continue; -- cgit v1.2.3 From 06f8ec9041f02d44bb0b75d47668e2fe00d5e0c3 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 19 Sep 2016 15:28:00 +0200 Subject: net-next: dsa: fix duplicate invocation of set_addr() commit 83c0afaec7b730b ("net: dsa: Add new binding implementation") has a duplicate invocation of the set_addr() operation callback. Remove one of them. Signed-off-by: John Crispin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 8278385dcd21..cffc19e972a1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -308,10 +308,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; - err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) -- cgit v1.2.3 From 092183df0fa1f4b49baad3a980c55d55de07dfb7 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 19 Sep 2016 15:28:01 +0200 Subject: net-next: dsa: make the set_addr() operation optional Only 1 of the 3 drivers currently has a set_addr() operation. Make the set_addr() callback optional to reduce the amount of empty stubs inside the drivers. Signed-off-by: John Crispin Signed-off-by: David S. 
Miller --- net/dsa/dsa.c | 8 +++++--- net/dsa/dsa2.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 66e31acfcad8..a6902c1e2f28 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -378,9 +378,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ret = ops->set_addr(ds, dst->master_netdev->dev_addr); - if (ret < 0) - goto out; + if (ops->set_addr) { + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); + if (ret < 0) + goto out; + } if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index cffc19e972a1..f8a7d9aab437 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -304,9 +304,11 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; - err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; + if (ds->ops->set_addr) { + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + } if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); -- cgit v1.2.3 From 190aa3e77880a05332ea1ccb382a51285d57adb5 Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Mon, 19 Sep 2016 13:50:59 -0700 Subject: openvswitch: Fix Frame-size larger than 1024 bytes warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to declare separate key on stack, we can just use sw_flow->key to store the key directly. This commit fixes following warning: net/openvswitch/datapath.c: In function ‘ovs_flow_cmd_new’: net/openvswitch/datapath.c:1080:1: warning: the frame size of 1040 bytes is larger than 1024 bytes [-Wframe-larger-than=] Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0536ab3504d5..474e7a6bfeb7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct sw_flow_key key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &new_flow->key, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, true, &mask); - /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &key, log); + &new_flow->key, log); if (error) goto err_kfree_flow; + /* unmasked key is needed to match when ufid is not used. */ + if (ovs_identifier_is_key(&new_flow->id)) + match.key = new_flow->id.unmasked_key; + + ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + /* Validate actions. 
*/ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); -- cgit v1.2.3 From 2279994d07ab67ff7a1d09bfbd65588332dfb6d8 Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Mon, 19 Sep 2016 13:51:00 -0700 Subject: openvswitch: avoid resetting flow key while installing new flow. since commit commit db74a3335e0f6 ("openvswitch: use percpu flow stats") flow alloc resets flow-key. So there is no need to reset the flow-key again if OVS is using newly allocated flow-key. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 8 ++++---- net/openvswitch/flow.c | 2 -- net/openvswitch/flow_netlink.c | 6 ++++-- net/openvswitch/flow_netlink.h | 3 ++- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 474e7a6bfeb7..4d67ea856067 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -955,7 +955,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, &mask); + ovs_match_init(&match, &new_flow->key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) @@ -1124,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &key, true, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); } else if (!ufid_present) { @@ -1241,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { @@ -1300,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0fa45439def1..634cc10d6dee 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -767,8 +767,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, { int err; - memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); - /* Extract metadata from netlink attributes. 
*/ err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 8efa718ddb5e..ae25ded82b3b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1996,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, + bool reset_key, struct sw_flow_mask *mask) { memset(match, 0, sizeof(*match)); match->key = key; match->mask = mask; - memset(key, 0, sizeof(*key)); + if (reset_key) + memset(key, 0, sizeof(*key)); if (mask) { memset(&mask->key, 0, sizeof(mask->key)); @@ -2049,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct nlattr *a; int err = 0, start, opts_type; - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 47dd142eca1c..45f9769e5aac 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void); size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); + struct sw_flow_key *key, bool reset_key, + struct sw_flow_mask *mask); int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. 
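To make the pull, retest, access pattern just described concrete, a hedged sketch of a tc (cls_act) classifier using the new helper might look as follows. This is not the iproute2 example referred to later; the section name, the 34-byte bound and the byte being rewritten are arbitrary assumptions, while bpf_skb_pull_data()/BPF_FUNC_skb_pull_data and the direct-write capability are what this patch adds:

#include <linux/types.h>
#include <linux/bpf.h>
#include <linux/pkt_cls.h>

#ifndef __section
# define __section(NAME) __attribute__((section(NAME), used))
#endif

/* stub for the helper introduced in this patch */
static int (*bpf_skb_pull_data)(void *ctx, __u32 len) =
	(void *) BPF_FUNC_skb_pull_data;

__section("classifier") int pull_then_write(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	/* want Ethernet (14) plus a minimal IPv4 header (20) linear */
	if (data + 34 > data_end) {
		/* non-linear or short: pull once, then retest */
		if (bpf_skb_pull_data(skb, 34))
			return TC_ACT_SHOT;
		/* the helper invalidates earlier packet pointers */
		data = (void *)(long)skb->data;
		data_end = (void *)(long)skb->data_end;
		if (data + 34 > data_end)
			return TC_ACT_SHOT;
	}

	/* direct packet write; the verifier-generated prologue has
	 * already uncloned the skb head for us if it was cloned
	 */
	((__u8 *)data)[14] = 0x45;	/* arbitrary example rewrite */

	return TC_ACT_OK;
}

If the skb carried CHECKSUM_COMPLETE, such a rewrite would additionally be paired with bpf_csum_diff() over the old and new words fed into the new bpf_csum_update() helper, as the message goes on to explain.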
Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 4 +- include/linux/skbuff.h | 14 ++++- include/uapi/linux/bpf.h | 21 ++++++++ kernel/bpf/helpers.c | 3 ++ kernel/bpf/verifier.c | 54 ++++++++++++++----- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++------ 6 files changed, 196 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. 
+ * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. 
@@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? 
: skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? 
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.3 From aecc5cefc389735b5327d234e11d1fe505e1c280 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 19 Sep 2016 19:02:51 -0400 Subject: net sched actions: fix GETing actions With the batch changes that translated transient actions into a temporary list lost in the translation was the fact that tcf_action_destroy() will eventually delete the action from the permanent location if the refcount is zero. 
Example of what broke:
...add a gact action to drop
sudo $TC actions add action drop index 10
...now retrieve it, looks good
sudo $TC actions get action gact index 10
...retrieve it again and find it is gone!
sudo $TC actions get action gact index 10
Fixes: 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array") Fixes: 824a7e8863b3 ("net_sched: remove an unnecessary list_del()") Fixes: f07fed82ad79 ("net_sched: remove the leftover cleanup_a()") Acked-by: Cong Wang Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d0aceb1740b1..c9102172ce3b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -592,6 +592,17 @@ err_out: return ERR_PTR(err); } +static void cleanup_a(struct list_head *actions, int ovr) +{ + struct tc_action *a; + + if (!ovr) + return; + + list_for_each_entry(a, actions, list) + a->tcfa_refcnt--; +} + int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions) { @@ -611,8 +622,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, goto err; } act->order = i; + if (ovr) + act->tcfa_refcnt++; list_add_tail(&act->list, actions); } + + /* Remove the temp refcnt which was necessary to protect against + * destroying an existing action which was being replaced + */ + cleanup_a(actions, ovr); return 0; err: @@ -882,6 +900,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; + if (event == RTM_GETACTION) + act->tcfa_refcnt++; list_add_tail(&act->list, &actions); } -- cgit v1.2.3 From f78e73e27fdeab6f9317667f7e9676b59c1ec1fb Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:08 -0400 Subject: tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict The upcoming change "lib/win_minmax: windowed min or max estimator" introduces a struct called minmax, which is then included in include/linux/tcp.h in the upcoming change "tcp: use windowed min filter library for TCP min_rtt estimation". This would create a compilation error for tcp_cdg.c, which defines its own minmax struct. To avoid this naming conflict (and potentially others in the future), this commit renames the version used in tcp_cdg.c to cdg_minmax. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Kenneth Klette Jonassen Acked-by: Kenneth Klette Jonassen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cdg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 03725b294286..35b280361cb2 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); module_param(use_tolerance, bool, 0644); MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); -struct minmax { +struct cdg_minmax { union { struct { s32 min; @@ -74,10 +74,10 @@ enum cdg_state { }; struct cdg { - struct minmax rtt; - struct minmax rtt_prev; - struct minmax *gradients; - struct minmax gsum; + struct cdg_minmax rtt; + struct cdg_minmax rtt_prev; + struct cdg_minmax *gradients; + struct cdg_minmax gsum; bool gfilled; u8 tail; u8 state; @@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - struct minmax *gradients; + struct cdg_minmax *gradients; switch (ev) { case CA_EVENT_CWND_RESTART: -- cgit v1.2.3 From 6403389211e1f4d40ed963fe47a96fce1a3ba7a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:10 -0400 Subject: tcp: use windowed min filter library for TCP min_rtt estimation Refactor the TCP min_rtt code to reuse the new win_minmax library in lib/win_minmax.c to simplify the TCP code. This is a pure refactor: the functionality is exactly the same. We just moved the windowed min code to make TCP easier to read and maintain, and to allow other parts of the kernel to use the windowed min/max filter code. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++-- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 64 ++++-------------------------------------------- net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 10 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c723a465125d..6433cc8b4667 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -234,9 +235,7 @@ struct tcp_sock { u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ - struct rtt_meas { - u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ - } rtt_min[3]; + struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ diff --git a/include/net/tcp.h b/include/net/tcp.h index fdfbedd61c67..2f1648af4d12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -671,7 +671,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { - return tp->rtt_min[0].rtt; + return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dae800092e6..e79ed17ccfd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - tp->rtt_min[0].rtt = ~0U; + minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad3e7eeed94..6886f386464f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2879,67 +2879,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, *rexmit = REXMIT_LOST; } -/* Kathleen Nichols' algorithm for tracking the minimum value of - * a data stream over some fixed time interval. (E.g., the minimum - * RTT over the past five minutes.) It uses constant space and constant - * time per update yet almost always delivers the same minimum as an - * implementation that has to keep all the data in the window. - * - * The algorithm keeps track of the best, 2nd best & 3rd best min - * values, maintaining an invariant that the measurement time of the - * n'th best >= n-1'th best. It also makes sure that the three values - * are widely separated in the time window since that bounds the worse - * case error when that data is monotonically increasing over the window. - * - * Upon getting a new min, we can forget everything earlier because it - * has no value - the new min is <= everything else in the window by - * definition and it's the most recent. So we restart fresh on every new min - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd - * best. - */ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { - const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; - struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { - .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), - .ts = now, - }; - u32 elapsed; - - /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ - if (unlikely(rttm.rtt <= m[0].rtt)) - m[0] = m[1] = m[2] = rttm; - else if (rttm.rtt <= m[1].rtt) - m[1] = m[2] = rttm; - else if (rttm.rtt <= m[2].rtt) - m[2] = rttm; - - elapsed = now - m[0].ts; - if (unlikely(elapsed > wlen)) { - /* Passed entire window without a new min so make 2nd choice - * the new min & 3rd choice the new 2nd. So forth and so on. - */ - m[0] = m[1]; - m[1] = m[2]; - m[2] = rttm; - if (now - m[0].ts > wlen) { - m[0] = m[1]; - m[1] = rttm; - if (now - m[0].ts > wlen) - m[0] = rttm; - } - } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { - /* Passed a quarter of the window without a new min so - * take 2nd choice from the 2nd quarter of the window. - */ - m[2] = m[1] = rttm; - } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { - /* Passed half the window without a new min so take the 3rd - * choice from the last half of the window. - */ - m[2] = rttm; - } + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + + minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + rtt_us ? 
: jiffies_to_usecs(1)); } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f63c73dc0acb..568947110b60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - newtp->rtt_min[0].rtt = ~0U; + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; -- cgit v1.2.3 From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:11 -0400 Subject: net_sched: sch_fq: add low_rate_threshold parameter This commit adds to the fq module a low_rate_threshold parameter to insert a delay after all packets if the socket requests a pacing rate below the threshold. This helps achieve more precise control of the sending rate with low-rate paths, especially policers. The basic issue is that if a congestion control module detects a policer at a certain rate, it may want fq to be able to shape to that policed rate. That way the sender can avoid policer drops by having the packets arrive at the policer at or just under the policed rate. The default threshold of 550Kbps was chosen analytically so that for policers or links at 500Kbps or 512Kbps fq would very likely invoke this mechanism, even if the pacing rate was briefly slightly above the available bandwidth. This value was then empirically validated with two years of production testing on YouTube video servers. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 2382eed50278..f8e39dbaa781 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -792,6 +792,8 @@ enum { TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index dc52cc10d6ed..5dd929cc1423 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +483,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +494,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; -- cgit v1.2.3 From b2d3ea4a730f812b9c0f67a67b6762ce66ddb17c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:12 -0400 Subject: tcp: switch back to proper tcp_skb_cb size check in tcp_init() Revert to the tcp_skb_cb size check that tcp_init() had before commit b4772ef879a8 ("net: use common macro for assering skb->cb[] available size in protocol families"). 
As related commit 744d5a3e9fe2 ("net: move skb->dropcount to skb->cb[]") explains, the sock_skb_cb_check_size() mechanism was added to ensure that there is space for dropcount, "for protocol families using it". But TCP is not a protocol using dropcount, so tcp_init() doesn't need to provision space for dropcount in the skb->cb[], and thus we can revert to the older form of the tcp_skb_cb size check. Doing so allows TCP to use 4 more bytes of the skb->cb[] space. Fixes: b4772ef879a8 ("net: use common macro for assering skb->cb[] available size in protocol families") Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e79ed17ccfd6..de02fb4b1349 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3261,11 +3261,12 @@ static void __init tcp_init_mem(void) void __init tcp_init(void) { - unsigned long limit; int max_rshare, max_wshare, cnt; + unsigned long limit; unsigned int i; - sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > + FIELD_SIZEOF(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); -- cgit v1.2.3 From 0682e6902a52aca7caf6ad42551b16ea0f87bc31 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:13 -0400 Subject: tcp: count packets marked lost for a TCP connection Count the number of packets that a TCP connection marks lost. Congestion control modules can use this loss rate information for more intelligent decisions about how fast to send. Specifically, this is used in TCP BBR policer detection. BBR uses a high packet loss rate as one signal in its policer detection and policer bandwidth estimation algorithm. The BBR policer detection algorithm cannot simply track retransmits, because a retransmit can be (and often is) an indicator of packets lost long, long ago. This is particularly true in a long CA_Loss period that repairs the initial massive losses when a policer kicks in. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6433cc8b4667..38590fbc0ac5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -267,6 +267,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 lost; /* Total data packets lost incl. rexmits */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6886f386464f..9413288c2778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -899,12 +899,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } +/* Sum the number of packets on the wire we have marked as lost. 
+ * There are two cases we care about here: + * a) Packet hasn't been marked lost (nor retransmitted), + * and this is the first loss. + * b) Packet has been marked both lost and retransmitted, + * and this means we think it was lost again. + */ +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (!(sacked & TCPCB_LOST) || + ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) + tp->lost += tcp_skb_pcount(skb); +} + static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); tp->lost_out += tcp_skb_pcount(skb); + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } } @@ -913,6 +930,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); + tcp_sum_lost(tp, skb); if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1890,6 +1908,7 @@ void tcp_enter_loss(struct sock *sk) struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ + bool mark_lost; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,8 +1942,12 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; + mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + is_reneg); + if (mark_lost) + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { + if (mark_lost) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); -- cgit v1.2.3 From b9f64820fb226a4e8ab10591f46cecd91ca56b30 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:14 -0400 Subject: tcp: track data delivery rate for a TCP connection This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series. Key state: tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field. tp->delivered_mstamp: the last time tp->delivered was updated. Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis: d1: the current tp->delivered after processing the ACK t1: the current time after processing the ACK d0: the prior tp->delivered when the acked skb was transmitted t0: the prior tp->delivered_mstamp when the acked skb was transmitted When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us. One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp. 
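As a rough illustration of this per-ACK arithmetic (a hypothetical sketch, not the code this patch adds to net/ipv4/tcp_rate.c below, which works on struct skb_mstamp timestamps), the core of a sample reduces to:

	/* Hypothetical sketch: one delivery rate sample from the snapshot
	 * taken at transmit time (d0, t0) and the state at ACK time (d1, t1).
	 */
	static void rate_sample_sketch(u32 d0, u32 d1, u64 t0_us, u64 t1_us,
				       s32 *delivered, long *interval_us)
	{
		*delivered   = d1 - d0;        /* becomes rs->delivered */
		*interval_us = t1_us - t0_us;  /* becomes rs->interval_us */
	}

For example, if 10 packets are newly (S)ACKed over an interval of 20000 usec, the sample corresponds to 10 / 0.02 s = 500 packets per second; the refinement of the interval to guard against ACK compression is discussed next.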
At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and and t1 are times at which packets were marked delivered. If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt) In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 + include/net/tcp.h | 35 +++++++++++- net/ipv4/Makefile | 2 +- net/ipv4/tcp_input.c | 46 +++++++++++----- net/ipv4/tcp_output.c | 4 ++ net/ipv4/tcp_rate.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/tcp_rate.c (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 38590fbc0ac5..c50e6aec005a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,8 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + struct skb_mstamp first_tx_mstamp; /* start of window send phase */ + struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f1648af4d12..b261c892605a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,14 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { - /* There is space for up to 20 bytes */ + /* There is space for up to 24 bytes */ __u32 in_flight;/* Bytes in flight when packet sent */ + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ + struct skb_mstamp first_tx_mstamp; + /* when we reached the "delivered" count */ + struct skb_mstamp delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -860,6 +866,26 @@ struct ack_sample { u32 in_flight; }; +/* A rate sample measures the number of (original/retransmitted) data + * packets delivered "delivered" over an interval of time "interval_us". + * The tcp_rate.c code fills in the rate sample, and congestion + * control modules that define a cong_control function to run at the end + * of ACK processing can optionally chose to consult this sample when + * setting cwnd and pacing rate. + * A sample is invalid if "delivered" or "interval_us" is negative. 
+ */ +struct rate_sample { + struct skb_mstamp prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + s32 delivered; /* number of packets delivered over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ + int losses; /* number of packets marked lost upon ACK */ + u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ + u32 prior_in_flight; /* in flight before this ACK */ + bool is_retrans; /* is sample from retransmission? */ +}; + struct tcp_congestion_ops { struct list_head list; u32 key; @@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_rate.c */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs); + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 24629b6f57cc..9cfff1a0bf71 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_recovery.o \ + tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9413288c2778..d9ed4bb96f74 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1112,6 +1112,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct rate_sample *rate; int flag; }; @@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) + TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack, tcp_skb_pcount(skb), &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); - if (found_dup_sack) + if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; + tp->delivered++; /* A spurious retransmission is delivered */ + } /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int 
tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, + struct skb_mstamp *now) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt, now; + struct skb_mstamp first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; - fully_acked = false; } else { /* Speedup tcp_unlink_write_queue() and next loop */ @@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; + tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - skb_mstamp_get(&now); if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); + ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); } if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); + sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); } - + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us); @@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; + struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - u32 prior_delivered = tp->delivered; + u32 delivered = tp->delivered; + u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + struct skb_mstamp now; sack_state.first_sackt.v64 = 0; + sack_state.rate = &rs; /* We very likely will need to access write queue head. 
*/ prefetchw(sk->sk_write_queue.next); @@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + skb_mstamp_get(&now); + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + &sack_state, &now); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + lost = tp->lost - lost; /* freshly marked lost */ + tcp_rate_gen(sk, delivered, lost, &now, &rs); + tcp_cong_control(sk, ack, delivered, flag); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8b45794eb6b2..e02c8ebf3ed4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; + tcp_rate_skb_sent(sk, skb); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); + /* Update delivered info for the new segment */ + TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; + /* If this packet has been sent out already, we must * adjust the various packet counters. */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c new file mode 100644 index 000000000000..1daed6af6e80 --- /dev/null +++ b/net/ipv4/tcp_rate.c @@ -0,0 +1,149 @@ +#include + +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. 
Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + */ + + +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + tp->first_tx_mstamp = skb->skb_mstamp; + tp->delivered_mstamp = skb->skb_mstamp; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the highest prior_delivered count. + */ +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (!scb->tx.delivered_mstamp.v64) + return; + + if (!rs->prior_delivered || + after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = skb_mstamp_us_delta( + &skb->skb_mstamp, + &scb->tx.first_tx_mstamp); + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = skb->skb_mstamp; + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp.v64 = 0; +} + +/* Update the connection delivery information and generate a rate sample. 
*/ +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = *now; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available. */ + if (!rs->prior_mstamp.v64) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + rs->interval_us = -1; + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + } +} -- cgit v1.2.3 From d7722e8570fc0f1e003cee7cf37694041828918b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:15 -0400 Subject: tcp: track application-limited rate samples This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application. Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it. This logic marks the flow app-limited on a write if *all* of the following are true: 1) There is less than 1 MSS of unsent data in the write queue available to transmit. 2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue). 3) The connection is not limited by cwnd. 4) There are no lost packets to retransmit. The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit. When we send every skb we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited. The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. 
It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe. We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 8 ++++++++ net/ipv4/tcp_minisocks.c | 3 +++ net/ipv4/tcp_rate.c | 29 ++++++++++++++++++++++++++++- 5 files changed, 45 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c50e6aec005a..fdcd00ffcb66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b261c892605a..a69ed7f0030c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -764,7 +764,9 @@ struct tcp_skb_cb { union { struct { /* There is space for up to 24 bytes */ - __u32 in_flight;/* Bytes in flight when packet sent */ + __u32 in_flight:30,/* Bytes in flight at transmit */ + is_app_limited:1, /* cwnd not fully used? */ + unused:1; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -883,6 +885,7 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ }; @@ -978,6 +981,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct skb_mstamp *now, struct rate_sample *rs); +void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de02fb4b1349..2250f891f931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ @@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? 
*/ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; @@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 568947110b60..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1daed6af6e80..52ff84be59ab 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -26,9 +26,13 @@ * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. */ - /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, after(scb->tx.delivered, rs->prior_delivered)) { rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ @@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". @@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); } } + +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. 
*/ + tcp_packets_in_flight(tp) < tp->snd_cwnd && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} -- cgit v1.2.3 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit export two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info). tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++++- include/uapi/linux/tcp.h | 3 +++ net/ipv4/tcp.c | 11 ++++++++++- net/ipv4/tcp_rate.c | 12 +++++++++++- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fdcd00ffcb66..a17ae7b85218 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -213,7 +213,8 @@ struct tcp_sock { u8 reord; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ - u8 unused; + u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + unused:7; u8 nonagle : 4,/* Disable Nagle algorithm? 
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ @@ -271,6 +272,8 @@ struct tcp_sock { u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 482898fc433a..73ac0db487f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -167,6 +167,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + __u8 tcpi_delivery_rate_app_limited:1; __u32 tcpi_rto; __u32 tcpi_ato; @@ -211,6 +212,8 @@ struct tcp_info { __u32 tcpi_min_rtt; __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + __u64 tcpi_delivery_rate; }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2250f891f931..f253e5019d22 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2712,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp, intv; unsigned int start; int notsent_bytes; u64 rate64; @@ -2802,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; + rate = READ_ONCE(tp->rate_delivered); + intv = READ_ONCE(tp->rate_interval_us); + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + put_unaligned(rate64, &info->tcpi_delivery_rate); + } } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 52ff84be59ab..9be1581a5a08 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -149,12 +149,22 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * for connections suffer heavy or prolonged losses. */ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - rs->interval_us = -1; if (!rs->is_retrans) pr_debug("tcp rate: %ld %d %u %u %u\n", rs->interval_us, rs->delivered, inet_csk(sk)->icsk_ca_state, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; } } -- cgit v1.2.3 From ed6e7268b930e0a9a65d895d368eac79a438d992 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:17 -0400 Subject: tcp: allow congestion control module to request TSO skb segment count Add the tso_segs_goal() function in tcp_congestion_ops to allow the congestion control module to specify the number of segments that should be in a TSO skb sent by tcp_write_xmit() and tcp_xmit_retransmit_queue(). 
The congestion control module can either request a particular number of segments in TSO skb that we transmit, or return 0 if it doesn't care. This allows the upcoming BBR congestion control module to select small TSO skb sizes if the module detects that the bottleneck bandwidth is very low, or that the connection is policed to a low rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a69ed7f0030c..f8f581fd05f5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -913,6 +913,8 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* suggest number of segments for each skb to transmit (optional) */ + u32 (*tso_segs_goal)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e02c8ebf3ed4..01379567a732 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1566,6 +1566,17 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) return min_t(u32, segs, sk->sk_gso_max_segs); } +/* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. + */ +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + + return tso_segs ? : tcp_tso_autosize(sk, mss_now); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -2061,7 +2072,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } - max_segs = tcp_tso_autosize(sk, mss_now); + max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2778,7 +2789,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); + max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { __u8 sacked; int segs; -- cgit v1.2.3 From 1b3878ca1551f3baab2c408d1e703b5ef785a1b2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:18 -0400 Subject: tcp: export tcp_tso_autosize() and parameterize minimum number of TSO segments To allow congestion control modules to use the default TSO auto-sizing algorithm as one of the ingredients in their own decision about TSO sizing: 1) Export tcp_tso_autosize() so that CC modules can use it. 2) Change tcp_tso_autosize() to allow callers to specify a minimum number of segments per TSO skb, in case the congestion control module has a different notion of the best floor for TSO skbs for the connection right now. For very low-rate paths or policed connections it can be appropriate to use smaller TSO skbs. 
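To illustrate how the tso_segs_goal() hook and this parameterized helper are meant to compose, here is a minimal hypothetical snippet (not part of this series; the module name and the 1.2 Mbit/s threshold are made up for illustration) of a congestion control module that allows single-segment skbs at very low pacing rates and otherwise defers to the stack's autosizing:

/* Hypothetical example only: choose a smaller TSO floor when the pacing
 * rate is low, otherwise keep the default floor of 2 segments.
 */
#include <net/tcp.h>

static u32 example_tso_segs_goal(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* sk_pacing_rate is in bytes/sec; 1200000 / 8 converts an
	 * illustrative 1.2 Mbit/s threshold to bytes/sec.
	 */
	u32 min_segs = sk->sk_pacing_rate < (1200000 / 8) ? 1 : 2;

	/* Returning 0 here instead would tell the stack "no preference". */
	return tcp_tso_autosize(sk, tp->mss_cache, min_segs);
}

static struct tcp_congestion_ops example_cc __read_mostly = {
	.name		= "example",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.tso_segs_goal	= example_tso_segs_goal,	/* new optional hook */
};

The BBR module added later in this series follows essentially this pattern in bbr_set_tso_segs_goal()/bbr_tso_segs_goal().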
Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f8f581fd05f5..349204130d84 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -533,6 +533,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); bool tcp_may_send_now(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01379567a732..0bf3d481fa85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1549,7 +1549,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1561,10 +1562,11 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) * This preserves ACK clocking and is consistent * with tcp_tso_should_defer() heuristic. */ - segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + segs = max_t(u32, bytes / mss_now, min_tso_segs); return min_t(u32, segs, sk->sk_gso_max_segs); } +EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1574,7 +1576,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; - return tso_segs ? : tcp_tso_autosize(sk, mss_now); + return tso_segs ? : + tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.3 From 556c6b46d194cc0dbb6a5b22f1d2bbc699c86d8e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:19 -0400 Subject: tcp: export tcp_mss_to_mtu() for congestion control modules Export tcp_mss_to_mtu(), so that congestion control modules can use this to help calculate a pacing rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bf3d481fa85..7d025a7804b5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1362,6 +1362,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) } return mtu; } +EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) -- cgit v1.2.3 From 77bfc174c38e558a3425d3b069aa2762b2fedfdd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:20 -0400 Subject: tcp: allow congestion control to expand send buffer differently Currently the TCP send buffer expands to twice cwnd, in order to allow limited transmits in the CA_Recovery state. This assumes that cwnd does not increase in the CA_Recovery. For some congestion control algorithms, like the upcoming BBR module, if the losses in recovery do not indicate congestion then we may continue to raise cwnd multiplicatively in recovery. In such cases the current multiplier will falsely limit the sending rate, much as if it were limited by the application. This commit adds an optional congestion control callback to use a different multiplier to expand the TCP send buffer. For congestion control modules that do not specificy this callback, TCP continues to use the previous default of 2. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 349204130d84..1aa9628ae608 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -917,6 +917,8 @@ struct tcp_congestion_ops { void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* suggest number of segments for each skb to transmit (optional) */ u32 (*tso_segs_goal)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9ed4bb96f74..13a2e70141f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; @@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk) * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ - sndmem = 2 * nr_segs * per_mss; + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); -- cgit v1.2.3 From c0402760f565ae066621ebf8720a32fba074d538 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:21 -0400 Subject: tcp: new CC hook to set sending rate with rate_sample in any CA state This commit introduces an optional new "omnipotent" hook, cong_control(), for congestion control modules. 
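As a rough sketch of what a module built on this hook looks like (a hypothetical toy, not code from this series), the callback receives the rate_sample for the ACK being processed and takes over the cwnd and pacing decisions itself:

/* Toy example: pace at roughly the delivery rate this one sample reports
 * and keep cwnd near what was in flight. A production module (such as BBR,
 * added later in this series) filters many samples rather than trusting a
 * single ACK.
 */
#include <net/tcp.h>

static void toy_cong_control(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	if (rs->delivered <= 0 || rs->interval_us <= 0)
		return;					/* not a usable sample */

	/* delivered packets over elapsed microseconds -> bytes per second */
	rate = (u64)rs->delivered * tp->mss_cache * USEC_PER_SEC;
	do_div(rate, rs->interval_us);
	sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate);

	/* crude cwnd: cover what was in flight plus what this ACK freed up */
	tp->snd_cwnd = min(max(rs->prior_in_flight + rs->acked_sacked, 4U),
			   tp->snd_cwnd_clamp);
}

Such a module sets .cong_control = toy_cong_control in its tcp_congestion_ops; as the tcp_cong.c hunk below shows, the registration check then no longer requires a cong_avoid implementation.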
The cong_control() function is called at the end of processing an ACK (i.e., after updating sequence numbers, the SACK scoreboard, and loss detection). At that moment we have precise delivery rate information the congestion control module can use to control the sending behavior (using cwnd, TSO skb size, and pacing rate) in any CA state. This function can also be used by a congestion control that prefers not to use the default cwnd reduction approach (i.e., the PRR algorithm) during CA_Recovery to control the cwnd and sending rate during loss recovery. We take advantage of the fact that recent changes defer the retransmission or transmission of new data (e.g. by F-RTO) in recovery until the new tcp_cong_control() function is run. With this commit, we only run tcp_update_pacing_rate() if the congestion control is not using this new API. New congestion controls which use the new API do not want the TCP stack to run the default pacing rate calculation and overwrite whatever pacing rate they have chosen at initialization time. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++++ net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_input.c | 17 ++++++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1aa9628ae608..f83b7f220a65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -919,6 +919,10 @@ struct tcp_congestion_ops { u32 (*tso_segs_goal)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ + void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 882caa4e72bc..1294af4e0127 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13a2e70141f5..980a83edfa63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (inet_csk(sk)->icsk_ca_ops->cong_control) + return; + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { @@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * information. All transmission or retransmission are delayed afterwards. 
*/ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, - int flag) + int flag, const struct rate_sample *rs) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cong_control) { + icsk->icsk_ca_ops->cong_control(sk, rs); + return; + } + if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, flag); @@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag); + tcp_cong_control(sk, ack, delivered, flag, &rs); tcp_xmit_recovery(sk, rexmit); return 1; @@ -5982,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else tcp_init_metrics(sk); - tcp_update_pacing_rate(sk); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; -- cgit v1.2.3 From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:23 -0400 Subject: tcp_bbr: add BBR congestion control This commit implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A detailed description of BBR will be published in ACM Queue, Vol. 14 No. 5, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR has significantly increased throughput and reduced latency for connections on Google's internal backbone networks and google.com and YouTube Web servers. BBR requires only changes on the sender side, not in the network or the receiver side. Thus it can be incrementally deployed on today's Internet, or in datacenters. The Internet has predominantly used loss-based congestion control (largely Reno or CUBIC) since the 1980s, relying on packet loss as the signal to slow down. While this worked well for many years, loss-based congestion control is unfortunately out-dated in today's networks. On today's Internet, loss-based congestion control causes the infamous bufferbloat problem, often causing seconds of needless queuing delay, since it fills the bloated buffers in many last-mile links. On today's high-speed long-haul links using commodity switches with shallow buffers, loss-based congestion control has abysmal throughput because it over-reacts to losses caused by transient traffic bursts. In 1981 Kleinrock and Gale showed that the optimal operating point for a network maximizes delivered bandwidth while minimizing delay and loss, not only for single connections but for the network as a whole. Finding that optimal operating point has been elusive, since any single network measurement is ambiguous: network measurements are the result of both bandwidth and propagation delay, and those two cannot be measured simultaneously. While it is impossible to disambiguate any single bandwidth or RTT measurement, a connection's behavior over time tells a clearer story. BBR uses a measurement strategy designed to resolve this ambiguity. It combines these measurements with a robust servo loop using recent control systems advances to implement a distributed congestion control algorithm that reacts to actual congestion, not packet loss or transient queue delay, and is designed to converge with high probability to a point near the optimal operating point. 
In a nutshell, BBR creates an explicit model of the network pipe by sequentially probing the bottleneck bandwidth and RTT. On the arrival of each ACK, BBR derives the current delivery rate of the last round trip, and feeds it through a windowed max-filter to estimate the bottleneck bandwidth. Conversely it uses a windowed min-filter to estimate the round trip propagation delay. The max-filtered bandwidth and min-filtered RTT estimates form BBR's model of the network pipe. Using its model, BBR sets control parameters to govern sending behavior. The primary control is the pacing rate: BBR applies a gain multiplier to transmit faster or slower than the observed bottleneck bandwidth. The conventional congestion window (cwnd) is now the secondary control; the cwnd is set to a small multiple of the estimated BDP (bandwidth-delay product) in order to allow full utilization and bandwidth probing while bounding the potential amount of queue at the bottleneck. When a BBR connection starts, it enters STARTUP mode and applies a high gain to perform an exponential search to quickly probe the bottleneck bandwidth (doubling its sending rate each round trip, like slow start). However, instead of continuing until it fills up the buffer (i.e. a loss), or until delay or ACK spacing reaches some threshold (like Hystart), it uses its model of the pipe to estimate when that pipe is full: it estimates the pipe is full when it notices the estimated bandwidth has stopped growing. At that point it exits STARTUP and enters DRAIN mode, where it reduces its pacing rate to drain the queue it estimates it has created. Then BBR enters steady state. In steady state, PROBE_BW mode cycles between first pacing faster to probe for more bandwidth, then pacing slower to drain any queue that created if no more bandwidth was available, and then cruising at the estimated bandwidth to utilize the pipe without creating excess queue. Occasionally, on an as-needed basis, it sends significantly slower to probe for RTT (PROBE_RTT mode). BBR has been fully deployed on Google's wide-area backbone networks and we're experimenting with BBR on Google.com and YouTube on a global scale. Replacing CUBIC with BBR has resulted in significant improvements in network latency and application (RPC, browser, and video) metrics. For more details please refer to our upcoming ACM Queue publication. Example performance results, to illustrate the difference between BBR and CUBIC: Resilience to random loss (e.g. from shallow buffers): Consider a netperf TCP_STREAM test lasting 30 secs on an emulated path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher). Low latency with the bloated buffers common in today's last-mile links: Consider a netperf TCP_STREAM test lasting 120 secs on an emulated path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck buffer. Both fully utilize the bottleneck bandwidth, but BBR achieves this with a median RTT 25x lower (43 ms instead of 1.09 secs). Our long-term goal is to improve the congestion control algorithms used on the Internet. We are hopeful that BBR can help advance the efforts toward this goal, and motivate the community to do further research. 
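To make "a small multiple of the estimated BDP" concrete, a rough worked calculation (illustrative arithmetic only, using the 10 Mbit/s / 40 ms bufferbloat scenario above): the path's BDP is 1.25 MB/s * 0.040 s = 50 kB, about 34 full-sized 1448-byte segments. Pacing, the primary control, keeps the sending rate near 1.25 MB/s (briefly 5/4 of that while probing for more bandwidth), while the cwnd cap of roughly 2 * BDP, about 70 segments, only bounds the data in flight. Because transmission is paced at the bottleneck rate, the standing queue stays far below the 1000-packet buffer, which is why the median RTT stays near the 40 ms propagation delay instead of growing past a second as with CUBIC.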
Test results, performance evaluations, feedback, and BBR-related discussions are very welcome in the public e-mail list for BBR: https://groups.google.com/forum/#!forum/bbr-dev NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, since pacing is integral to the BBR design and implementation. BBR without pacing would not function properly, and may incur unnecessary high packet loss rates. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 13 + net/ipv4/Kconfig | 18 + net/ipv4/Makefile | 1 + net/ipv4/tcp_bbr.c | 896 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 928 insertions(+) create mode 100644 net/ipv4/tcp_bbr.c (limited to 'net') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b5c366f87b3e..509cd961068d 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -124,6 +124,7 @@ enum { INET_DIAG_PEERS, INET_DIAG_PAD, INET_DIAG_MARK, + INET_DIAG_BBRINFO, __INET_DIAG_MAX, }; @@ -157,8 +158,20 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +/* INET_DIAG_BBRINFO */ + +struct tcp_bbr_info { + /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lo; /* lower 32 bits of bw */ + __u32 bbr_bw_hi; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; }; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 50d6a9b49f6c..300b06888fdf 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -640,6 +640,21 @@ config TCP_CONG_CDG D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg +config TCP_CONG_BBR + tristate "BBR TCP" + default n + ---help--- + + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. 
+ choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -674,6 +689,9 @@ choice config DEFAULT_CDG bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_RENO bool "Reno" endchoice diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9cfff1a0bf71..bc6a6c8b9bcd 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o +obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c new file mode 100644 index 000000000000..0ea66c2c9344 --- /dev/null +++ b/net/ipv4/tcp_bbr.c @@ -0,0 +1,896 @@ +/* Bottleneck Bandwidth and RTT (BBR) congestion control + * + * BBR congestion control computes the sending rate based on the delivery + * rate (throughput) estimated from ACKs. In a nutshell: + * + * On each ACK, update our model of the network path: + * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) + * min_rtt = windowed_min(rtt, 10 seconds) + * pacing_rate = pacing_gain * bottleneck_bandwidth + * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) + * + * The core algorithm does not react directly to packet losses or delays, + * although BBR may adjust the size of next send per ACK when loss is + * observed, or adjust the sending rate if it estimates there is a + * traffic policer, in order to keep the drop rate reasonable. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, + * since pacing is integral to the BBR design and implementation. + * BBR without pacing would not function properly, and may incur unnecessary + * high packet loss rates. + */ +#include +#include +#include +#include +#include +#include + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + restore_cwnd:1, /* decided to revert cwnd to old value */ + round_start:1, /* start of packet-timed tx->ack round? */ + tso_segs_goal:7, /* segments we want in each skb we send */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + unused:5, + lt_is_sampling:1, /* taking long-term ("LT") samples now? */ + lt_rtt_cnt:7, /* round trips in long-term interval */ + lt_use_bw:1; /* use lt_bw as our bw estimate? */ + u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ + u32 lt_last_delivered; /* LT intvl start: tp->delivered */ + u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ + u32 lt_last_lost; /* LT intvl start: tp->lost */ + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_cnt:3, /* number of rounds without large bw gains */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + unused_b:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ +}; + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* Window length of bw filter (in rounds): */ +static const int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec): */ +static const u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ +static const u32 bbr_probe_rtt_mode_ms = 200; +/* Skip TSO below the following bandwidth (bits/sec): */ +static const int bbr_min_tso_rate = 1200000; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ +static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ +static const int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ +static const int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* 
cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static const u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight: + */ +static const u32 bbr_cwnd_min_target = 4; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available: */ +static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ +static const u32 bbr_full_bw_cnt = 3; + +/* "long-term" ("LT") bandwidth estimator parameters... */ +/* The minimum number of rounds in an LT bw sampling interval: */ +static const u32 bbr_lt_intvl_min_rtts = 4; +/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +static const u32 bbr_lt_loss_thresh = 50; +/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +static const u32 bbr_lt_bw_diff = 4000 / 8; +/* If we estimate we're policed, use lt_bw for this many round trips: */ +static const u32 bbr_lt_bw_max_rtts = 48; + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_cnt >= bbr_full_bw_cnt; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return minmax_get(&bbr->bw); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +{ + rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC; + return rate >> BW_SCALE; +} + +/* Pace using current bw estimate and a gain factor. In order to help drive the + * network toward lower queues while maintaining high utilization and low + * latency, the average pacing rate aims to be slightly (~1%) lower than the + * estimated bandwidth. This is an important aspect of the design. In this + * implementation this slightly lower pacing rate is achieved implicitly by not + * including link-layer headers in the packet size used for the pacing rate. + */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +/* Return count of segments we want in the skbs we send, or 0 for default. 
*/ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->tso_segs_goal; +} + +static void bbr_set_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 min_segs; + + min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), + 0x7FU); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + } +} + +/* Find target cwnd. Right-size the cwnd based on min RTT and the + * estimated bottleneck bandwidth: + * + * cwnd = bw * min_rtt * gain = BDP * gain + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + * + * To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: TCP_INIT_CWND. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd += 3 * bbr->tso_segs_goal; + + /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
*/ + cwnd = (cwnd + 1) & ~1U; + + return cwnd; +} + +/* An optimization in BBR to reduce losses: On the first round of recovery, we + * follow the packet conservation principle: send P packets per P packets acked. + * After that, we slow-start and send at most 2*P packets per P packets acked. + * After recovery finishes, or upon undo, we restore the cwnd we had when + * recovery started (capped by the target cwnd based on estimated BDP). + * + * TODO(ycheng/ncardwell): implement a rate-based approach. + */ +static bool bbr_set_cwnd_to_recover_or_restore( + struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; + u32 cwnd = tp->snd_cwnd; + + /* An ACK for P pkts should release at most 2*P packets. We do this + * in two steps. First, here we deduct the number of lost packets. + * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. + */ + if (rs->losses > 0) + cwnd = max_t(s32, cwnd - rs->losses, 1); + + if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { + /* Starting 1st round of Recovery, so do packet conservation. */ + bbr->packet_conservation = 1; + bbr->next_rtt_delivered = tp->delivered; /* start round now */ + /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ + cwnd = tcp_packets_in_flight(tp) + acked; + } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { + /* Exiting loss recovery; restore cwnd saved before recovery. */ + bbr->restore_cwnd = 1; + bbr->packet_conservation = 0; + } + bbr->prev_ca_state = state; + + if (bbr->restore_cwnd) { + /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ + cwnd = max(cwnd, bbr->prior_cwnd); + bbr->restore_cwnd = 0; + } + + if (bbr->packet_conservation) { + *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); + return true; /* yes, using packet conservation */ + } + *new_cwnd = cwnd; + return false; +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd = 0, target_cwnd = 0; + + if (!acked) + return; + + if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) + goto done; + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + target_cwnd = bbr_target_cwnd(sk, bw, gain); + if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ + cwnd = min(cwnd + acked, target_cwnd); + else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) + cwnd = cwnd + acked; + cwnd = max(cwnd, bbr_cwnd_min_target); + +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); +} + +/* End cycle phase if it's time and/or we hit the phase's in-flight target. 
*/ +static bool bbr_is_next_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = + skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + + /* The pacing_gain of 1.0 paces at the estimated bw to try to fully + * use the pipe without increasing the queue. + */ + if (bbr->pacing_gain == BBR_UNIT) + return is_full_length; /* just use wall clock time */ + + inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + bw = bbr_max_bw(sk); + + /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at + * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is + * small (e.g. on a LAN). We do not persist if packets are lost, since + * a path with small buffers may not hold that much. + */ + if (bbr->pacing_gain > BBR_UNIT) + return is_full_length && + (rs->losses || /* perhaps pacing_gain*BDP won't fit */ + inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + + /* A pacing_gain < 1.0 tries to drain extra queue we added if bw + * probing didn't find more bw. If inflight falls to match BDP then we + * estimate queue is drained; persisting would underutilize the pipe. + */ + return is_full_length || + inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); +} + +static void bbr_advance_cycle_phase(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); + bbr->cycle_mstamp = tp->delivered_mstamp; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; +} + +/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && + bbr_is_next_cycle_phase(sk, rs)) + bbr_advance_cycle_phase(sk); +} + +static void bbr_reset_startup_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_STARTUP; + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; +} + +static void bbr_reset_probe_bw_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_PROBE_BW; + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = bbr_cwnd_gain; + bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +} + +static void bbr_reset_mode(struct sock *sk) +{ + if (!bbr_full_bw_reached(sk)) + bbr_reset_startup_mode(sk); + else + bbr_reset_probe_bw_mode(sk); +} + +/* Start a new long-term sampling interval. */ +static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +} + +/* Completely reset long-term bandwidth sampling. */ +static void bbr_reset_lt_bw_sampling(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_bw = 0; + bbr->lt_use_bw = 0; + bbr->lt_is_sampling = false; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 diff; + + if (bbr->lt_bw) { /* do we have bw from a previous interval? 
*/ + /* Is new bw close to the lt_bw from the previous interval? */ + diff = abs(bw - bbr->lt_bw); + if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || + (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= + bbr_lt_bw_diff)) { + /* All criteria are met; estimate we're policed. */ + bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ + bbr->lt_use_bw = 1; + bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ + bbr->lt_rtt_cnt = 0; + return; + } + } + bbr->lt_bw = bw; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of + * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and + * explicitly models their policed rate, to reduce unnecessary losses. We + * estimate that we're policed if we see 2 consecutive sampling intervals with + * consistent throughput and high packet loss. If we think we're being policed, + * set lt_bw to the "long-term" average delivery rate from those 2 intervals. + */ +static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; + s32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && + ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { + bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ + bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ + } + return; + } + + /* Wait for the first loss before sampling, to let the policer exhaust + * its tokens and estimate the steady-state rate allowed by the policer. + * Starting samples earlier includes bursts that over-estimate the bw. + */ + if (!bbr->lt_is_sampling) { + if (!rs->losses) + return; + bbr_reset_lt_bw_sampling_interval(sk); + bbr->lt_is_sampling = true; + } + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (rs->is_app_limited) { + bbr_reset_lt_bw_sampling(sk); + return; + } + + if (bbr->round_start) + bbr->lt_rtt_cnt++; /* count round trips in this interval */ + if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) + return; /* sampling interval needs to be longer */ + if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { + bbr_reset_lt_bw_sampling(sk); /* interval is too long */ + return; + } + + /* End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (!rs->losses) + return; + + /* Calculate packets lost and delivered in sampling interval. */ + lost = tp->lost - bbr->lt_last_lost; + delivered = tp->delivered - bbr->lt_last_delivered; + /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ + if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) + return; + + /* Find average delivery rate in this sampling interval. */ + t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); + if (t < 1) + return; /* interval is less than one jiffy, so wait */ + t = jiffies_to_usecs(t); + /* Interval long enough for jiffies_to_usecs() to return a bogus 0? 
*/ + if (t < 1) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +} + +/* Estimate the bandwidth based on how fast packets are delivered */ +static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->round_start = 0; + if (rs->delivered < 0 || rs->interval_us <= 0) + return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ + if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->rtt_cnt++; + bbr->round_start = 1; + bbr->packet_conservation = 0; + } + + bbr_lt_bw_sampling(sk, rs); + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + */ + bw = (u64)rs->delivered * BW_UNIT; + do_div(bw, rs->interval_us); + + /* If this sample is application-limited, it is likely to have a very + * low delivered count that represents application behavior rather than + * the available network rate. Such a sample could drag down estimated + * bw, causing needless slow-down. Thus, to continue to send at the + * last measured network rate, we filter out app-limited samples unless + * they describe the path bw at least as well as our bw model. + * + * So the goal during app-limited phase is to proceed with the best + * network rate no matter how long. We automatically leave this + * phase when app writes faster than the network can deliver :) + */ + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { + /* Incorporate new sample into our max bw filter. */ + minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + } +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ + bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + filter_expired = after(tcp_time_stamp, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; + bbr->min_rtt_stamp = tcp_time_stamp; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + bbr->probe_rtt_done_stamp = tcp_time_stamp + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && + after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_time_stamp; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } + } + } + bbr->idle_restart = 0; +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +{ + bbr_update_bw(sk, rs); + bbr_update_cycle_phase(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr_check_drain(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +static void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw; + + bbr_update_model(sk, rs); + + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_tso_segs_goal(sk); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->prior_cwnd = 0; + bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_time_stamp; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + + /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); + sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ + bbr_set_pacing_rate(sk, bw, bbr_high_gain); + + bbr->restore_cwnd = 0; + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp.v64 = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* In theory BBR does not need to undo the cwnd since it does not + * always reduce cwnd on losses (see bbr_main()). Keep it for now. + */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +/* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ +static u32 bbr_ssthresh(struct sock *sk) +{ + bbr_save_cwnd(sk); + return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ +} + +static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw(sk); + + bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; + memset(&info->bbr, 0, sizeof(info->bbr)); + info->bbr.bbr_bw_lo = (u32)bw; + info->bbr.bbr_bw_hi = (u32)(bw >> 32); + info->bbr.bbr_min_rtt = bbr->min_rtt_us; + info->bbr.bbr_pacing_gain = bbr->pacing_gain; + info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr); + } + return 0; +} + +static void bbr_set_state(struct sock *sk, u8 new_state) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + bbr->round_start = 1; /* treat RTO like end of a round */ + bbr_lt_bw_sampling(sk, &rs); + } +} + +static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, + .tso_segs_goal = bbr_tso_segs_goal, + .get_info = bbr_get_info, + .set_state = bbr_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -- cgit v1.2.3 From b5036cd4ed3173ab8cdbc85e2ba74acf46bafb51 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 20 Sep 2016 16:17:22 +0200 Subject: ipmr, ip6mr: return lastuse relative to now When I introduced the lastuse member I made a subtle error because it was returned as an absolute value but that is meaningless to user-space as it doesn't allow to see how old exactly an entry is. Let's make it similar to how the bridge returns such values and make it relative to "now" (jiffies). This allows us to show the actual age of the entries and is much more useful (e.g. user-space daemons can age out entries, iproute2 can display the lastuse properly). Fixes: 43b9e1274060 ("net: ipmr/ip6mr: add support for keeping an entry age") Reported-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/ipv4/ipmr.c | 7 +++++-- net/ipv6/ip6mr.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 26253328d227..a87bcd2d4a94 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6122f9c5cc49..fccb5dd91902 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; -- cgit v1.2.3 From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: vti6: fix input path Since commit 1625f4529957, vti6 is broken, all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in xfrm6_rcv_spi(). A new function xfrm6_rcv_tnl() that enables to pass a value to xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function is used in several handlers). 
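To make the resulting receive path concrete, a simplified sketch of the call flow after this change (argument lists abbreviated; taken from the diff that follows) is:

    /* tunnel input: the ip6_tnl pointer is now threaded through explicitly */
    vti6_rcv(skb)
        -> xfrm6_rcv_tnl(skb, t)                  /* new helper carrying t */
            -> xfrm6_rcv_spi(skb, nexthdr, 0, t)  /* stores t, no longer NULLs it */

    /* non-tunnel input is unchanged for existing callers */
    xfrm6_rcv(skb)
        -> xfrm6_rcv_tnl(skb, NULL)
            -> xfrm6_rcv_spi(skb, nexthdr, 0, NULL)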
CC: Alexey Kodanev Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +++- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..17934312eecb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From c2f672fc94642bae96821a393f342edcfa9794a6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 Sep 2016 15:45:26 +0200 Subject: xfrm: state lookup can be lockless This is called from the packet input path, we get lock contention if many 
cpus handle ipsec in parallel. After recent rcu conversion it is safe to call __xfrm_state_lookup without the spinlock. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ba8bf518ba14..a38fdead38ea 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1431,9 +1431,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 { struct xfrm_state *x; - spin_lock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup); -- cgit v1.2.3 From 332ae8e2f6ecda5e50c5c62ed62894963e3a83f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:53 +0100 Subject: net: cls_bpf: add hardware offload This patch adds hardware offload capability to cls_bpf classifier, similar to what have been done with U32 and flower. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/net/pkt_cls.h | 14 ++++++++++ net/sched/cls_bpf.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a10d8d18ce19..69f242c71865 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -789,6 +789,7 @@ enum { TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_MATCHALL, + TC_SETUP_CLSBPF, }; struct tc_cls_u32_offload; @@ -800,6 +801,7 @@ struct tc_to_netdev { struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; struct tc_cls_matchall_offload *cls_mall; + struct tc_cls_bpf_offload *cls_bpf; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a459be5fe1c2..41e8071dff87 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -486,4 +486,18 @@ struct tc_cls_matchall_offload { unsigned long cookie; }; +enum tc_clsbpf_command { + TC_CLSBPF_ADD, + TC_CLSBPF_REPLACE, + TC_CLSBPF_DESTROY, +}; + +struct tc_cls_bpf_offload { + enum tc_clsbpf_command command; + struct tcf_exts *exts; + struct bpf_prog *prog; + const char *name; + bool exts_integrated; +}; + #endif diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c6f7a47541eb..6523c5b4c0a5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -39,6 +39,7 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; + bool offloaded; struct tcf_exts exts; u32 handle; union { @@ -138,6 +139,71 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) return !prog->bpf_ops; } +static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, + enum tc_clsbpf_command cmd) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_bpf_offload bpf_offload = {}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSBPF; + offload.cls_bpf = &bpf_offload; + + bpf_offload.command = cmd; + bpf_offload.exts = &prog->exts; + bpf_offload.prog = prog->filter; + bpf_offload.name = prog->bpf_name; + bpf_offload.exts_integrated = prog->exts_integrated; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); +} + +static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) +{ + struct 
net_device *dev = tp->q->dev_queue->dev; + struct cls_bpf_prog *obj = prog; + enum tc_clsbpf_command cmd; + + if (oldprog && oldprog->offloaded) { + if (tc_should_offload(dev, tp, 0)) { + cmd = TC_CLSBPF_REPLACE; + } else { + obj = oldprog; + cmd = TC_CLSBPF_DESTROY; + } + } else { + if (!tc_should_offload(dev, tp, 0)) + return; + cmd = TC_CLSBPF_ADD; + } + + if (cls_bpf_offload_cmd(tp, obj, cmd)) + return; + + obj->offloaded = true; + if (oldprog) + oldprog->offloaded = false; +} + +static void cls_bpf_stop_offload(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + int err; + + if (!prog->offloaded) + return; + + err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + if (err) { + pr_err("Stopping hardware offload failed: %d\n", err); + return; + } + + prog->offloaded = false; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -177,6 +243,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -193,6 +260,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(prog, tmp, &head->plist, link) { + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -415,6 +483,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; + cls_bpf_offload(tp, prog, oldprog); + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); -- cgit v1.2.3 From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:54 +0100 Subject: net: cls_bpf: limit hardware offload by software-only flag Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag. Unlike U32 and flower cls_bpf already has some netlink flags defined. Create a new attribute to be able to use the same flag values as the above. Unlike U32 and flower reject unknown flags. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 1 + include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_bpf.c | 22 ++++++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41e8071dff87..57af9f3032ff 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -498,6 +498,7 @@ struct tc_cls_bpf_offload { struct bpf_prog *prog; const char *name; bool exts_integrated; + u32 gen_flags; }; #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8915b61bbf83..8fd715f806a2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -396,6 +396,7 @@ enum { TCA_BPF_FD, TCA_BPF_NAME, TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, __TCA_BPF_MAX, }; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6523c5b4c0a5..ebf01f7c1470 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 +#define CLS_BPF_SUPPORTED_GEN_FLAGS \ + TCA_CLS_FLAGS_SKIP_HW struct cls_bpf_head { struct list_head plist; @@ -40,6 +42,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; bool offloaded; + u32 gen_flags; struct tcf_exts exts; u32 handle; union { @@ -55,6 +58,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, @@ -154,6 +158,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.prog = prog->filter; bpf_offload.name = prog->bpf_name; bpf_offload.exts_integrated = prog->exts_integrated; + bpf_offload.gen_flags = prog->gen_flags; return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); @@ -167,14 +172,14 @@ static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd; if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, tp, 0)) { + if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else { obj = oldprog; cmd = TC_CLSBPF_DESTROY; } } else { - if (!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, tp, prog->gen_flags)) return; cmd = TC_CLSBPF_ADD; } @@ -370,6 +375,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; + u32 gen_flags = 0; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; @@ -394,8 +400,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout; + } + } prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; ret = is_bpf ? 
cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); @@ -568,6 +583,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; + if (prog->gen_flags && + nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); -- cgit v1.2.3 From eadb41489fd2249e71fd14b36fb488ed7217ca4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:55 +0100 Subject: net: cls_bpf: add support for marking filters as hardware-only Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_SW flag. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index ebf01f7c1470..1becc2fe1bc5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -28,7 +28,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 #define CLS_BPF_SUPPORTED_GEN_FLAGS \ - TCA_CLS_FLAGS_SKIP_HW + (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW) struct cls_bpf_head { struct list_head plist; @@ -96,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, qdisc_skb_cb(skb)->tc_classid = prog->res.classid; - if (at_ingress) { + if (tc_skip_sw(prog->gen_flags)) { + filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0; + } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_end(skb); @@ -164,32 +166,42 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, tp->protocol, &offload); } -static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, - struct cls_bpf_prog *oldprog) +static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) { struct net_device *dev = tp->q->dev_queue->dev; struct cls_bpf_prog *obj = prog; enum tc_clsbpf_command cmd; + bool skip_sw; + int ret; + + skip_sw = tc_skip_sw(prog->gen_flags) || + (oldprog && tc_skip_sw(oldprog->gen_flags)); if (oldprog && oldprog->offloaded) { if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; - } else { + } else if (!tc_skip_sw(prog->gen_flags)) { obj = oldprog; cmd = TC_CLSBPF_DESTROY; + } else { + return -EINVAL; } } else { if (!tc_should_offload(dev, tp, prog->gen_flags)) - return; + return skip_sw ? -EINVAL : 0; cmd = TC_CLSBPF_ADD; } - if (cls_bpf_offload_cmd(tp, obj, cmd)) - return; + ret = cls_bpf_offload_cmd(tp, obj, cmd); + if (ret) + return skip_sw ? ret : 0; obj->offloaded = true; if (oldprog) oldprog->offloaded = false; + + return 0; } static void cls_bpf_stop_offload(struct tcf_proto *tp, @@ -498,7 +510,11 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; - cls_bpf_offload(tp, prog, oldprog); + ret = cls_bpf_offload(tp, prog, oldprog); + if (ret) { + cls_bpf_delete_prog(tp, prog); + return ret; + } if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); -- cgit v1.2.3 From 68d640630d4ef2a4bf3f68b5073dec5e4c4f878b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:02 +0100 Subject: net: cls_bpf: allow offloaded filters to update stats Call into offloaded filters to update stats. 
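On the driver side, stats requests arrive through the same ndo_setup_tc() callback as the add/replace/destroy commands introduced earlier in this series. A minimal sketch of how a driver might dispatch them (the foo_* helpers are hypothetical; only the command values and the tc_to_netdev layout come from these patches):

    static int foo_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
                            struct tc_to_netdev *tc)
    {
            if (tc->type != TC_SETUP_CLSBPF)
                    return -EOPNOTSUPP;

            switch (tc->cls_bpf->command) {
            case TC_CLSBPF_ADD:
            case TC_CLSBPF_REPLACE:
                    return foo_bpf_offload_add(dev, tc->cls_bpf);   /* hypothetical */
            case TC_CLSBPF_DESTROY:
                    return foo_bpf_offload_del(dev, tc->cls_bpf);   /* hypothetical */
            case TC_CLSBPF_STATS:
                    /* fold hardware counters into the actions in tc->cls_bpf->exts */
                    return foo_bpf_offload_stats(dev, tc->cls_bpf); /* hypothetical */
            default:
                    return -EOPNOTSUPP;
            }
    }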
Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_bpf.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 57af9f3032ff..5ccaa4be7d96 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -490,6 +490,7 @@ enum tc_clsbpf_command { TC_CLSBPF_ADD, TC_CLSBPF_REPLACE, TC_CLSBPF_DESTROY, + TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1becc2fe1bc5..bb1d5a487081 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -221,6 +221,15 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, prog->offloaded = false; } +static void cls_bpf_offload_update_stats(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + if (!prog->offloaded) + return; + + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -577,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, tm->tcm_handle = prog->handle; + cls_bpf_offload_update_stats(tp, prog); + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; -- cgit v1.2.3 From 9798e6fe4f9b6a2847a40e24b75e68afdc7a01b3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:05 +0100 Subject: net: act_mirred: allow statistic updates from offloaded actions Implement .stats_update() callback. The implementation is generic and can be reused by other simple actions if needed. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1c76387c5d9c..667dc382df82 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -204,6 +204,13 @@ out: return retval; } +static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse) +{ + tcf_lastuse_update(&a->tcfa_tm); + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); +} + static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -281,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = { .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred, + .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, -- cgit v1.2.3 From 5a924b8951f835b5ff8a3d9f434f3b230fc9905f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs Don't store the rxrpc protocol header in sk_buffs on the transmit queue, but rather generate it on the fly and pass it to kernel_sendmsg() as a separate iov. This reduces the amount of storage required. Note that the security header is still stored in the sk_buff as it may get encrypted along with the data (and doesn't change with each transmission). 
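The core of the change is that rxrpc_send_data_packet() now builds the wire header on the stack for each transmission and passes a two-element kvec to kernel_sendmsg(), roughly (condensed from the diff that follows):

    struct rxrpc_wire_header whdr;  /* rebuilt for every (re)transmission */
    struct kvec iov[2];

    whdr.serial = htonl(atomic_inc_return(&conn->serial)); /* fresh serial each send */
    /* ... remaining header fields filled in from call/conn state ... */

    iov[0].iov_base = &whdr;        /* protocol header, no longer stored in the skb */
    iov[0].iov_len  = sizeof(whdr);
    iov[1].iov_base = skb->head;    /* security header + payload only */
    iov[1].iov_len  = skb->len;

    ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2,
                         iov[0].iov_len + iov[1].iov_len);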
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 5 ++- net/rxrpc/call_event.c | 11 ++----- net/rxrpc/conn_object.c | 1 - net/rxrpc/output.c | 83 +++++++++++++++++++++++++++++++++---------------- net/rxrpc/rxkad.c | 8 ++--- net/rxrpc/sendmsg.c | 51 +++++------------------------- 6 files changed, 71 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 034f525f2235..f021df4a6a22 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -385,10 +385,9 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + u32 security_nonce; /* response re-use preventer */ u8 size_align; /* data size alignment (for security) */ - u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ - u32 security_nonce; /* response re-use preventer */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; @@ -946,7 +945,7 @@ extern const s8 rxrpc_ack_priority[]; * output.c */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); -int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_reject_packets(struct rxrpc_local *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7d1b99824ed9..6247ce25eb21 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -139,7 +139,6 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, */ static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; @@ -201,15 +200,8 @@ static void rxrpc_resend(struct rxrpc_call *call) skb = call->rxtx_buffer[ix]; rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); - sp = rxrpc_skb(skb); - - /* Each Tx packet needs a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - whdr = (struct rxrpc_wire_header *)skb->head; - whdr->serial = htonl(sp->hdr.serial); - - if (rxrpc_send_data_packet(call->conn, skb) < 0) { + if (rxrpc_send_data_packet(call, skb) < 0) { call->resend_at = now + 2; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; @@ -217,6 +209,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); + sp = rxrpc_skb(skb); sp->resend_at = now + rxrpc_resend_timeout; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3b55aee0c436..e1e83af47866 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -53,7 +53,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_wire_header); conn->idle_timestamp = jiffies; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 16e18a94ffa6..817fb0e82d6a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -208,19 +208,42 @@ out: /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) { - struct kvec iov[1]; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_wire_header whdr; + struct 
rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; + struct kvec iov[2]; + rxrpc_serial_t serial; + size_t len; int ret, opt; _enter(",{%d}", skb->len); - iov[0].iov_base = skb->head; - iov[0].iov_len = skb->len; + /* Each transmission of a Tx packet needs a new serial number */ + serial = atomic_inc_return(&conn->serial); + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(serial); + whdr.type = RXRPC_PACKET_TYPE_DATA; + whdr.flags = sp->hdr.flags; + whdr.userStatus = 0; + whdr.securityIndex = call->security_ix; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(call->service_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = skb->head; + iov[1].iov_len = skb->len; + len = iov[0].iov_len + iov[1].iov_len; - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -234,26 +257,33 @@ int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) } } + _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); + /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { - down_read(&conn->params.local->defrag_sem); - /* send the packet by UDP - * - returns -EMSGSIZE if UDP would have to fragment the packet - * to go out of the interface - * - in which case, we'll have processed the ICMP error - * message and update the peer record - */ - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); - - up_read(&conn->params.local->defrag_sem); - if (ret == -EMSGSIZE) - goto send_fragmentable; - - _leave(" = %d [%u]", ret, conn->params.peer->maxdata); - return ret; + if (iov[1].iov_len >= call->peer->maxdata) + goto send_fragmentable; + + down_read(&conn->params.local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + + up_read(&conn->params.local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + +done: + if (ret == 0) { + sp->resend_at = jiffies + rxrpc_resend_timeout; + sp->hdr.serial = serial; } + _leave(" = %d [%u]", ret, call->peer->maxdata); + return ret; send_fragmentable: /* attempt to send this message with fragmentation enabled */ @@ -268,8 +298,8 @@ send_fragmentable: SOL_IP, IP_MTU_DISCOVER, (char *)&opt, sizeof(opt)); if (ret == 0) { - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 2, len); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -298,8 +328,7 @@ send_fragmentable: } up_write(&conn->params.local->defrag_sem); - _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); - return ret; + goto done; } /* diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index ae392558829d..88d080a1a3de 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) case 
RXRPC_SECURITY_AUTH: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level1_hdr); - conn->header_size += sizeof(struct rxkad_level1_hdr); break; case RXRPC_SECURITY_ENCRYPT: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level2_hdr); - conn->header_size += sizeof(struct rxkad_level2_hdr); break; default: ret = -EKEYREJECTED; @@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; data_size |= (u32)check << 16; hdr.data_size = htonl(data_size); @@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; @@ -277,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* calculate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - call->crypto_buf[0] = htonl(sp->hdr.callNumber); + call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6a39ee97a0b7..814b17f23971 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -134,13 +134,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, write_unlock_bh(&call->state_lock); } - _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); sp->resend_at = jiffies + rxrpc_resend_timeout; - ret = rxrpc_send_data_packet(call->conn, skb); + ret = rxrpc_send_data_packet(call, skb); if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); @@ -150,29 +148,6 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _leave(""); } -/* - * Convert a host-endian header into a network-endian header. 
- */ -static void rxrpc_insert_header(struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = htonl(sp->hdr.serial); - whdr.type = sp->hdr.type; - whdr.flags = sp->hdr.flags; - whdr.userStatus = sp->hdr.userStatus; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = htons(sp->hdr._rsvd); - whdr.serviceId = htons(sp->hdr.serviceId); - - memcpy(skb->head, &whdr, sizeof(whdr)); -} - /* * send data through a socket * - must be called in process context @@ -232,7 +207,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); - size = space + call->conn->header_size; + size = space + call->conn->security_size; _debug("SIZE: %zu/%zu/%zu", chunk, space, size); @@ -248,9 +223,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ASSERTCMP(skb->mark, ==, 0); - _debug("HS: %u", call->conn->header_size); - skb_reserve(skb, call->conn->header_size); - skb->len += call->conn->header_size; + _debug("HS: %u", call->conn->security_size); + skb_reserve(skb, call->conn->security_size); + skb->len += call->conn->security_size; sp = rxrpc_skb(skb); sp->remain = chunk; @@ -312,33 +287,23 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, seq = call->tx_top + 1; - sp->hdr.epoch = conn->proto.epoch; - sp->hdr.cid = call->cid; - sp->hdr.callNumber = call->call_id; sp->hdr.seq = seq; - sp->hdr.serial = atomic_inc_return(&conn->serial); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; - sp->hdr.userStatus = 0; - sp->hdr.securityIndex = call->security_ix; sp->hdr._rsvd = 0; - sp->hdr.serviceId = call->service_id; + sp->hdr.flags = conn->out_clientflag; - sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (call->tx_top - call->tx_hard_ack < call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (more && seq & 1) + if (seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = conn->security->secure_packet( - call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_wire_header)); + call, skb, skb->mark, skb->head); if (ret < 0) goto out; - rxrpc_insert_header(skb); rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } -- cgit v1.2.3 From f07373ead455a396e15a431bc08d8ce1dac6f1cf Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:32 +0100 Subject: rxrpc: Add re-sent Tx annotation Add a Tx-phase annotation for packet buffers to indicate that a buffer has already been retransmitted. This will be used by future congestion management. Re-retransmissions of a packet don't affect the congestion window management in the same way as initial retransmissions.
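Concretely, each rxtx_annotations entry now packs a 2-bit state together with flag bits, so the state can change without losing the "already resent" marker (condensed from the definitions and usage in the diff that follows):

    u8 annotation = call->rxtx_annotations[ix];
    u8 anno_type  = annotation & RXRPC_TX_ANNO_MASK;   /* ACK/UNACK/NAK/RETRANS */
    u8 anno_flags = annotation & ~RXRPC_TX_ANNO_MASK;  /* e.g. RXRPC_TX_ANNO_RESENT */

    /* change the state while preserving the flags; once a packet has actually
     * been retransmitted, the RESENT flag is added and then carried along
     */
    call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK | anno_flags | RXRPC_TX_ANNO_RESENT;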
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 28 +++++++++++++++++++--------- net/rxrpc/input.c | 14 +++++++++++--- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f021df4a6a22..dcf54e3fb478 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -505,6 +505,8 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_UNACK 1 #define RXRPC_TX_ANNO_NAK 2 #define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_TX_ANNO_MASK 0x03 +#define RXRPC_TX_ANNO_RESENT 0x04 #define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ #define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6247ce25eb21..34ad967f2d81 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -144,7 +144,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_seq_t cursor, seq, top; unsigned long resend_at, now; int ix; - u8 annotation; + u8 annotation, anno_type; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -165,14 +165,16 @@ static void rxrpc_resend(struct rxrpc_call *call) for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; - if (annotation == RXRPC_TX_ANNO_ACK) + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_ACK) continue; skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); - if (annotation == RXRPC_TX_ANNO_UNACK) { + if (anno_type == RXRPC_TX_ANNO_UNACK) { if (time_after(sp->resend_at, now)) { if (time_before(sp->resend_at, resend_at)) resend_at = sp->resend_at; @@ -181,7 +183,7 @@ static void rxrpc_resend(struct rxrpc_call *call) } /* Okay, we need to retransmit a packet. */ - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } call->resend_at = resend_at; @@ -194,7 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; - if (annotation != RXRPC_TX_ANNO_RETRANS) + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type != RXRPC_TX_ANNO_RETRANS) continue; skb = call->rxtx_buffer[ix]; @@ -220,10 +223,17 @@ static void rxrpc_resend(struct rxrpc_call *call) * received and the packet might have been hard-ACK'd (in which * case it will no longer be in the buffer). 
*/ - if (after(seq, call->tx_hard_ack) && - (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS || - call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)) - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + if (after(seq, call->tx_hard_ack)) { + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_RETRANS || + anno_type == RXRPC_TX_ANNO_NAK) { + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_UNACK; + } + annotation |= RXRPC_TX_ANNO_RESENT; + call->rxtx_annotations[ix] = annotation; + } if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7ac1edf3aac7..aa261df9fc9e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -388,17 +388,25 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, { bool resend = false; int ix; + u8 annotation, anno_type; for (; nr_acks > 0; nr_acks--, seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; switch (*acks++) { case RXRPC_ACK_TYPE_ACK: - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; + if (anno_type == RXRPC_TX_ANNO_ACK) + continue; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: - if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK) + if (anno_type == RXRPC_TX_ANNO_NAK) continue; - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_NAK | annotation; resend = true; break; default: -- cgit v1.2.3 From cf1a6474f80735ff4a5d99f3dd68a94dbec8455f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:41:53 +0100 Subject: rxrpc: Add per-peer RTT tracker Add a function to track the average RTT for a peer. Sources of RTT data will be added in subsequent patches. The RTT data will be useful in the future for determining resend timeouts and for handling the slow-start part of the Rx protocol. Also add a pair of tracepoints, one to log transmissions to elicit a response for RTT purposes and one to log responses that contribute RTT data. 
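The average itself is kept as a small ring of samples plus a running sum, so adding a measurement and recomputing the mean stays O(1) (condensed from rxrpc_peer_add_rtt() in the diff that follows):

    rtt = ktime_to_ns(ktime_sub(resp_time, send_time));    /* sample in ns */

    sum -= peer->rtt_cache[cursor];                         /* drop the oldest sample */
    sum += rtt;
    peer->rtt_cache[cursor] = rtt;
    peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
    if (usage < RXRPC_RTT_CACHE_SIZE)
            usage++;

    avg = sum;
    do_div(avg, usage);                                     /* updated mean */
    peer->rtt = avg;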
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 61 ++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 25 +++++++++++++++--- net/rxrpc/misc.c | 8 ++++++ net/rxrpc/peer_event.c | 41 +++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 75a5d8bf50e1..e8f2afbbe0bf 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -353,6 +353,67 @@ TRACE_EVENT(rxrpc_recvmsg, __entry->ret) ); +TRACE_EVENT(rxrpc_rtt_tx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, + rxrpc_serial_t send_serial), + + TP_ARGS(call, why, send_serial), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_tx_trace, why ) + __field(rxrpc_serial_t, send_serial ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + ), + + TP_printk("c=%p %s sr=%08x", + __entry->call, + rxrpc_rtt_tx_traces[__entry->why], + __entry->send_serial) + ); + +TRACE_EVENT(rxrpc_rtt_rx, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + s64 rtt, u8 nr, s64 avg), + + TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_rtt_rx_trace, why ) + __field(u8, nr ) + __field(rxrpc_serial_t, send_serial ) + __field(rxrpc_serial_t, resp_serial ) + __field(s64, rtt ) + __field(u64, avg ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->send_serial = send_serial; + __entry->resp_serial = resp_serial; + __entry->rtt = rtt; + __entry->nr = nr; + __entry->avg = avg; + ), + + TP_printk("c=%p %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", + __entry->call, + rxrpc_rtt_rx_traces[__entry->why], + __entry->send_serial, + __entry->resp_serial, + __entry->rtt, + __entry->nr, + __entry->avg) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dcf54e3fb478..79c671e552c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -258,10 +258,11 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 - suseconds_t rtt; /* current RTT estimate (in uS) */ - unsigned int rtt_point; /* next entry at which to insert */ - unsigned int rtt_usage; /* amount of cache actually used */ - suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ + u64 rtt; /* Current RTT estimate (in nS) */ + u64 rtt_sum; /* Sum of cache contents */ + u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ + u8 rtt_cursor; /* next entry at which to insert */ + u8 rtt_usage; /* amount of cache actually used */ }; /* @@ -657,6 +658,20 @@ enum rxrpc_recvmsg_trace { extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; +enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_ping, + rxrpc_rtt_tx__nr_trace +}; + +extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; + +enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx__nr_trace +}; + +extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -955,6 +970,8 @@ void rxrpc_reject_packets(struct rxrpc_local *); */ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); +void 
rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); /* * peer_object.c diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 026e1f2e83ff..6321c23f9a6e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -182,3 +182,11 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { [rxrpc_recvmsg_to_be_accepted] = "TBAC", [rxrpc_recvmsg_return] = "RETN", }; + +const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { + [rxrpc_rtt_tx_ping] = "PING", +}; + +const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { + [rxrpc_rtt_rx_ping_response] = "PONG", +}; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 18276e7cb9e0..bf13b8470c9a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -305,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work) rxrpc_put_peer(peer); _leave(""); } + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt; + u64 sum = peer->rtt_sum, avg; + u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; + + rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); + if (rtt < 0) + return; + + /* Replace the oldest datum in the RTT buffer */ + sum -= peer->rtt_cache[cursor]; + sum += rtt; + peer->rtt_cache[cursor] = rtt; + peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); + peer->rtt_sum = sum; + if (usage < RXRPC_RTT_CACHE_SIZE) { + usage++; + peer->rtt_usage = usage; + } + + /* Now recalculate the average */ + if (usage == RXRPC_RTT_CACHE_SIZE) { + avg = sum / RXRPC_RTT_CACHE_SIZE; + } else { + avg = sum; + do_div(avg, usage); + } + + peer->rtt = avg; + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, + usage, avg); +} -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6dab3f7457c..9bf60b556bd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. 
*/ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.3 From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:10 +0300 Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action TCA_VLAN_ACT_MODIFY allows one to change an existing tag. It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id, priority). If packet is vlan tagged, then the tag gets overwritten according to user specified attributes. For example, this allows user to replace a tag's vid while preserving its priority bits (as opposed to "action vlan pop pipe action vlan push"). Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index be72b6e3843b..bddb272b843f 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 struct tc_vlan { tc_gen; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 59a8d3150ae2..a95c00b119da 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_vlan *v = to_vlan(a); int action; int err; + u16 tci; spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); @@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; break; + case TCA_VLAN_ACT_MODIFY: + /* No-op if no vlan tag (either hw-accel or in-payload) */ + if (!skb_vlan_tagged(skb)) + goto unlock; + /* extract existing tag (and guarantee no hw-accel tag) */ + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + skb->vlan_tci = 0; + } else { + /* in-payload vlan tag, pop it */ + err = __skb_vlan_pop(skb, &tci); + if (err) + goto drop; + } + /* replace the vid */ + tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + /* replace prio bits, if tcfv_push_prio specified */ + if (v->tcfv_push_prio) { + tci &= ~VLAN_PRIO_MASK; + tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + } + /* put updated tci as hwaccel tag */ + __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + break; default: BUG(); } @@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_hash_release(*a, bind); @@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || + v->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto) || -- cgit v1.2.3 From 
636c2628086e40c86dac7ddc84a1c4b4fcccc6e3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:36 +0300 Subject: net: skbuff: Remove erroneous length validation in skb_vlan_pop() In 93515d53b1 "net: move vlan pop/push functions into common code" skb_vlan_pop was moved from its private location in openvswitch to skbuff common code. In case skb has non hw-accel vlan tag, the original 'pop_vlan()' assured that skb->len is sufficient (if skb->len < VLAN_ETH_HLEN then pop was considered a no-op). This validation was moved as is into the new common 'skb_vlan_pop'. Alas, in its original location (openvswitch), there was a guarantee that 'data' points to the mac_header, therefore the 'skb->len < VLAN_ETH_HLEN' condition made sense. However there's no such guarantee in the generic 'skb_vlan_pop'. For short packets received in rx path going through 'skb_vlan_pop', this causes 'skb_vlan_pop' to fail popping a valid vlan hdr (in the non hw-accel case) or to fail moving next tag into hw-accel tag. Remove the 'skb->len < VLAN_ETH_HLEN' condition entirely: It is superfluous since inner '__skb_vlan_pop' already verifies there are VLAN_ETH_HLEN writable bytes at the mac_header. Note this presents a slight change to skb_vlan_pop() users: In case total length is smaller than VLAN_ETH_HLEN, skb_vlan_pop() now returns an error, as opposed to previous "no-op" behavior. Existing callers (e.g. tc act vlan, ovs) usually drop the packet if 'skb_vlan_pop' fails. Fixes: 93515d53b1 ("net: move vlan pop/push functions into common code") Signed-off-by: Shmulik Ladkani Cc: Pravin Shelar Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c22351bd519..b2a51bf1b0f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,9 +4567,8 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4577,9 +4576,8 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3 From ecf4ee41d25832a6ec52f8b54dfaa46c08b949d5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:37 +0300 Subject: net: skbuff: Coding: Use eth_type_vlan() instead of open coding it Fix 'skb_vlan_pop' to use eth_type_vlan instead of directly comparing skb->protocol to ETH_P_8021Q or ETH_P_8021AD. Signed-off-by: Shmulik Ladkani Reviewed-by: Pravin B Shelar Signed-off-by: David S.
Miller --- net/core/skbuff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2a51bf1b0f9..d36c7548952f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,8 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4576,8 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.3 From adb03115f4590baa280ddc440a8eff08a6be0cb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Sep 2016 18:06:17 -0700 Subject: net: get rid of an signed integer overflow in ip_idents_reserve() Jiri Pirko reported an UBSAN warning happening in ip_idents_reserve() [] UBSAN: Undefined behaviour in ./arch/x86/include/asm/atomic.h:156:11 [] signed integer overflow: [] -2117905507 + -695755206 cannot be represented in type 'int' Since we do not have uatomic_add_return() yet, use atomic_cmpxchg() so that the arithmetics can be done using unsigned int. Fixes: 04ca6973f7c1 ("ip: make IP identifiers less predictable") Signed-off-by: Eric Dumazet Reported-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/route.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8110..b5b47a26d4ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 delta = 0; + u32 new, delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, p_id) - segs; + /* Do not use atomic_add_return() as it makes UBSAN unhappy */ + do { + old = (u32)atomic_read(p_id); + new = old + delta + segs; + } while (atomic_cmpxchg(p_id, old, new) != old); + + return new - segs; } EXPORT_SYMBOL(ip_idents_reserve); -- cgit v1.2.3 From f9616c35a0d786bc64fff4bf819d1e4984873367 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Sep 2016 22:45:58 -0700 Subject: tcp: implement TSQ for retransmits We saw sch_fq drops caused by the per flow limit of 100 packets and TCP when dealing with large cwnd and bursts of retransmits. Even after increasing the limit to 1000, and even after commit 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time"), we can still have these drops. Under certain conditions, TCP can spend a considerable amount of time queuing thousands of skbs in a single tcp_xmit_retransmit_queue() invocation, incurring latency spikes and stalls of other softirq handlers. This patch implements TSQ for retransmits, limiting number of packets and giving more chance for scheduling packets in both ways. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 72 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d025a7804b5..478dfc539178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | - TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->lost_out > tp->retrans_out && + tp->snd_cwnd > tcp_packets_in_flight(tp)) + tcp_xmit_retransmit_queue(sk); + + tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); + } } /* * One tasklet per cpu tries to send more skbs. @@ -2039,6 +2046,39 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } +/* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * (These limits are doubled for retransmits) + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + * Alas, some drivers / subsystems require a fair amount + * of queued bytes to ensure line rate. + * One example is wifi aggregation (802.11 AMPDU) + */ +static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, + unsigned int factor) +{ + unsigned int limit; + + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit <<= factor; + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + return true; + } + return false; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2125,29 +2165,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - /* TCP Small Queues : - * Control number of packets in qdisc/devices to two packets / or ~1 ms. - * This allows for : - * - better RTT estimation and ACK scheduling - * - faster recovery - * - high rates - * Alas, some drivers / subsystems require a fair amount - * of queued bytes to ensure line rate. - * One example is wifi aggregation (802.11 AMPDU) - */ - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); - - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tp->tsq_flags); - /* It is possible TX completion already happened - * before we set TSQ_THROTTLED, so we must - * test again the condition. 
- */ - smp_mb__after_atomic(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) - break; - } + if (tcp_small_queue_check(sk, skb, 0)) + break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2847,6 +2866,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; + if (tcp_small_queue_check(sk, skb, 1)) + return; + if (tcp_retransmit_skb(sk, skb, segs)) return; -- cgit v1.2.3 From e2f036a97271cf5811ee754bf321a29a814577f9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:55 -0300 Subject: sctp: rename WORD_TRUNC/ROUND macros To something more meaningful these days, specially because this is working on packet headers or lengths and which are not tied to any CPU arch but to the protocol itself. So, WORD_TRUNC becomes SCTP_TRUNC4 and WORD_ROUND becomes SCTP_PAD4. Reported-by: David Laight Reported-by: David Miller Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 10 +++++----- net/netfilter/xt_sctp.c | 2 +- net/sctp/associola.c | 2 +- net/sctp/chunk.c | 6 +++--- net/sctp/input.c | 8 ++++---- net/sctp/inqueue.c | 2 +- net/sctp/output.c | 12 ++++++------ net/sctp/sm_make_chunk.c | 28 ++++++++++++++-------------- net/sctp/sm_statefuns.c | 6 +++--- net/sctp/transport.c | 4 ++-- net/sctp/ulpevent.c | 4 ++-- 11 files changed, 42 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 632e205ca54b..87a7f42e7639 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -83,9 +83,9 @@ #endif /* Round an int up to the next multiple of 4. */ -#define WORD_ROUND(s) (((s)+3)&~3) +#define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ -#define WORD_TRUNC(s) ((s)&~3) +#define SCTP_TRUNC4(s) ((s)&~3) /* * Function declarations. 
@@ -433,7 +433,7 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); - frag = WORD_TRUNC(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); + frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); return frag; } @@ -462,7 +462,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) for (pos.v = chunk->member;\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ - pos.v += WORD_ROUND(ntohs(pos.p->length))) + pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) @@ -472,7 +472,7 @@ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(sctp_chunkhdr_t));\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ - err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length)))) + err = (sctp_errhdr_t *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ef36a56a02c6..4dedb96d1a06 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += WORD_ROUND(ntohs(sch->length)); + offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1c23060c41a6..f10d3397f917 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(sk, t, - WORD_TRUNC(dst_mtu(t->dst))); + SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index af9cc8055465..76eae828ec89 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -208,8 +208,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + - hmac_desc->hmac_len); + max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) + + hmac_desc->hmac_len); } /* Now, check if we need to reduce our max */ @@ -229,7 +229,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max) - max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. 
*/ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..a1d85065bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, - WORD_TRUNC(info)); + SCTP_TRUNC4(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = offset + WORD_ROUND(ntohs(ch->length)); + ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; @@ -1121,7 +1121,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; @@ -1190,7 +1190,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) + if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 6437aa97cfd7..f731de3e8428 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -213,7 +213,7 @@ new_skb: } chunk->chunk_hdr = ch; - chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 0c605ec74dc4..2a5c1896d18f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -297,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { sctp_xmit_t retval = SCTP_XMIT_OK; - __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -508,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (gso) { pkt_size = packet->overhead; list_for_each_entry(chunk, &packet->chunk_list, list) { - int padded = WORD_ROUND(chunk->skb->len); + int padded = SCTP_PAD4(chunk->skb->len); if (pkt_size + padded > tp->pathmtu) break; @@ -538,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * included in the chunk length field. The sender should * never pad with more than 3 bytes. * - * [This whole comment explains WORD_ROUND() below.] + * [This whole comment explains SCTP_PAD4() below.] */ pkt_size -= packet->overhead; @@ -560,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) has_data = 1; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) memset(skb_put(chunk->skb, padding), 0, padding); @@ -587,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * acknowledged or have failed. * Re-queue auth chunks if needed. 
*/ - pkt_size -= WORD_ROUND(chunk->skb->len); + pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); @@ -911,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, */ maxsize = pmtu - packet->overhead; if (packet->auth) - maxsize -= WORD_ROUND(packet->auth->skb->len); + maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..79dd66079dd7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; - chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (asoc->prsctp_enable) @@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -1390,7 +1390,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. 
*/ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); + skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1482,7 +1482,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) void *target; void *padding; int chunklen = ntohs(chunk->chunk_hdr->length); - int padlen = WORD_ROUND(chunklen) - chunklen; + int padlen = SCTP_PAD4(chunklen) - chunklen; padding = skb_put(chunk->skb, padlen); target = skb_put(chunk->skb, len); @@ -1900,7 +1900,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc, struct __sctp_missing report; __u16 len; - len = WORD_ROUND(sizeof(report)); + len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. @@ -2098,9 +2098,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, if (*errp) { if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length)))) + SCTP_PAD4(ntohs(param.p->length)))) sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), + SCTP_PAD4(ntohs(param.p->length)), param.v); } else { /* If there is no memory for generating the ERROR diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d88bb2b0b699..026e3bca4a94 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, } /* Report violation if chunk len overflows */ - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 81b86678be4d..ce54dce13ddb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). 
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d85b803da11d..bea00058ce35 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, ch = (sctp_errhdr_t *)(chunk->skb->data); cause = ch->cause; - elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); @@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * MUST ignore the padding bytes. */ len = ntohs(chunk->chunk_hdr->length); - padding = WORD_ROUND(len) - len; + padding = SCTP_PAD4(len) - len; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); -- cgit v1.2.3 From 4a225ce3950879a5426c56f306f5d1c9d6330292 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:56 -0300 Subject: sctp: make use of SCTP_TRUNC4 macro And avoid the usage of '&~3'. This is the last place still not using the macro. Also break the line to make it easier to read. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 76eae828ec89..8afe2e90d003 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -195,9 +195,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, /* This is the biggest possible DATA chunk that can fit into * the packet */ - max_data = (asoc->pathmtu - - sctp_sk(asoc->base.sk)->pf->af->net_header_len - - sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3; + max_data = asoc->pathmtu - + sctp_sk(asoc->base.sk)->pf->af->net_header_len - + sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + max_data = SCTP_TRUNC4(max_data); max = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks -- cgit v1.2.3 From 8e83134db4ecb77a1dc3390b60ddeea840a5afbc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Send pings to get RTT data Send a PING ACK packet to the peer when we get a new incoming call from a peer we don't have a record for. The PING RESPONSE ACK packet will tell us the following about the peer: (1) its receive window size (2) its MTU sizes (3) its support for jumbo DATA packets (4) if it supports slow start (similar to RFC 5681) (5) an estimate of the RTT This is necessary because the peer won't normally send us an ACK until it gets to the Rx phase and we send it a packet, but we would like to know some of this information before we start sending packets. A pair of tracepoints are added so that RTT determination can be observed. 
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++++-- net/rxrpc/input.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- net/rxrpc/misc.c | 11 ++++++----- net/rxrpc/output.c | 22 ++++++++++++++++++++++ 4 files changed, 80 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 79c671e552c3..8b47f468eb9d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -403,6 +403,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_PINGING, /* Ping in process */ }; /* @@ -487,6 +488,8 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ /* Rx/Tx circular buffer, depending on phase. * @@ -530,8 +533,8 @@ struct rxrpc_call { u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ - unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ - unsigned short rx_pkt_len; /* Current recvmsg packet len */ + rxrpc_serial_t ackr_ping; /* Last ping sent */ + ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index aa261df9fc9e..a0a5bd108c9e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,19 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, + int skew) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + true, true); +} + /* * Apply a hard ACK by advancing the Tx window. */ @@ -342,6 +355,32 @@ ack: _leave(" [queued]"); } +/* + * Process a ping response. 
+ */ +static void rxrpc_input_ping_response(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + rxrpc_serial_t ping_serial; + ktime_t ping_time; + + ping_time = call->ackr_ping_time; + smp_rmb(); + ping_serial = call->ackr_ping; + + if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || + before(orig_serial, ping_serial)) + return; + clear_bit(RXRPC_CALL_PINGING, &call->flags); + if (after(orig_serial, ping_serial)) + return; + + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response, + orig_serial, ack_serial, ping_time, resp_time); +} + /* * Process the extra information that may be appended to an ACK packet */ @@ -438,6 +477,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_ackinfo info; u8 acks[RXRPC_MAXACKS]; } buf; + rxrpc_serial_t acked_serial; rxrpc_seq_t first_soft_ack, hard_ack; int nr_acks, offset; @@ -449,6 +489,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } sp->offset += sizeof(buf.ack); + acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; @@ -460,10 +501,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, ntohs(buf.ack.maxSkew), first_soft_ack, ntohl(buf.ack.previousPacket), - ntohl(buf.ack.serial), + acked_serial, rxrpc_acks(buf.ack.reason), buf.ack.nAcks); + if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_input_ping_response(call, skb->tstamp, acked_serial, + sp->hdr.serial); + if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, @@ -830,6 +875,7 @@ void rxrpc_data_ready(struct sock *udp_sk) rcu_read_unlock(); goto reject_packet; } + rxrpc_send_ping(call, skb, skew); } rxrpc_input_call_packet(call, skb, skew); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 6321c23f9a6e..56e668352fc7 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -83,11 +83,12 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_DELAY] = 1, [RXRPC_ACK_REQUESTED] = 2, [RXRPC_ACK_IDLE] = 3, - [RXRPC_ACK_PING_RESPONSE] = 4, - [RXRPC_ACK_DUPLICATE] = 5, - [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, - [RXRPC_ACK_EXCEEDS_WINDOW] = 7, - [RXRPC_ACK_NOSPACE] = 8, + [RXRPC_ACK_DUPLICATE] = 4, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 5, + [RXRPC_ACK_EXCEEDS_WINDOW] = 6, + [RXRPC_ACK_NOSPACE] = 7, + [RXRPC_ACK_PING_RESPONSE] = 8, + [RXRPC_ACK_PING] = 9, }; const char *rxrpc_acks(u8 reason) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 817fb0e82d6a..0d89cd3f2c01 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -57,6 +57,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; + if (pkt->ack.reason == RXRPC_ACK_PING) + pkt->whdr.flags |= RXRPC_REQUEST_ACK; + if (after(top, hard_ack)) { seq = hard_ack + 1; do { @@ -97,6 +100,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) struct kvec iov[2]; rxrpc_serial_t serial; size_t len, n; + bool ping = false; int ioc, ret; u32 abort_code; @@ -147,6 +151,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) ret = 0; goto out; } + ping = (call->ackr_reason == RXRPC_ACK_PING); n = rxrpc_fill_out_ack(call, pkt); call->ackr_reason = 0; @@ -183,12 +188,29 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } + if (ping) { + call->ackr_ping = serial; + smp_wmb(); + /* We need to stick a time in before we send the 
packet in case + * the reply gets back before kernel_sendmsg() completes - but + * asking UDP to send the packet can take a relatively long + * time, so we update the time after, on the assumption that + * the packet transmission is more likely to happen towards the + * end of the kernel_sendmsg() call. + */ + call->ackr_ping_time = ktime_get_real(); + set_bit(RXRPC_CALL_PINGING, &call->flags); + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); + } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + if (ping) + call->ackr_ping_time = ktime_get_real(); if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { switch (type) { case RXRPC_PACKET_TYPE_ACK: + clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), -- cgit v1.2.3 From 7aa51da7c88d42cc0bb85ab7d01429fbd4e51282 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Expedite ping response transmission Expedite the transmission of a response to a PING ACK by sending it from sendmsg if one is pending. We're most likely to see a PING ACK during the client call Tx phase as the other side may use it to determine a number of parameters, such as the client's receive window size, the RTT and whether the client is doing slow start (similar to RFC5681). If we don't expedite it, it's left to the background processing thread to transmit. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 814b17f23971..3c969de3ef05 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -180,6 +180,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, copied = 0; do { + /* Check to see if there's a ping ACK to reply to. */ + if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + if (!skb) { size_t size, chunk, max, space; -- cgit v1.2.3 From 50235c4b5a2fb9a9690f02cd1dea6ca047d7f79e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Obtain RTT data by requesting ACKs on DATA packets In addition to sending a PING ACK to gain RTT data, we can set the RXRPC_REQUEST_ACK flag on a DATA packet and get a REQUESTED-ACK ACK. The ACK packet contains the serial number of the packet it is in response to, so we can look through the Tx buffer for a matching DATA packet. This requires that the data packets be stamped with the time of transmission as a ktime rather than having the resend_at time in jiffies. This further requires the resend code to do the resend determination in ktimes and convert to jiffies to set the timer. 
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++---- net/rxrpc/call_event.c | 19 +++++++++---------- net/rxrpc/input.c | 35 +++++++++++++++++++++++++++++++++++ net/rxrpc/misc.c | 6 ++++-- net/rxrpc/output.c | 7 +++++-- net/rxrpc/sendmsg.c | 1 - net/rxrpc/sysctl.c | 2 +- 7 files changed, 57 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 8b47f468eb9d..1c4597b2c6cd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -142,10 +142,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - unsigned long resend_at; /* time in jiffies at which to resend */ - struct { - u8 nr_jumbo; /* Number of jumbo subpackets */ - }; + u8 nr_jumbo; /* Number of jumbo subpackets */ }; union { unsigned int offset; /* offset into buffer of next read */ @@ -663,6 +660,7 @@ extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_ping, + rxrpc_rtt_tx_data, rxrpc_rtt_tx__nr_trace }; @@ -670,6 +668,7 @@ extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx_requested_ack, rxrpc_rtt_rx__nr_trace }; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 34ad967f2d81..adb2ec61e21f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -142,12 +142,14 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - unsigned long resend_at, now; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at; int ix; u8 annotation, anno_type; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); + max_age = ktime_sub_ms(now, rxrpc_resend_timeout); + spin_lock_bh(&call->lock); cursor = call->tx_hard_ack; @@ -160,8 +162,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * the packets in the Tx buffer we're going to resend and what the new * resend timeout will be. */ - now = jiffies; - resend_at = now + rxrpc_resend_timeout; + oldest = now; for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; @@ -175,9 +176,9 @@ static void rxrpc_resend(struct rxrpc_call *call) sp = rxrpc_skb(skb); if (anno_type == RXRPC_TX_ANNO_UNACK) { - if (time_after(sp->resend_at, now)) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; + if (ktime_after(skb->tstamp, max_age)) { + if (ktime_before(skb->tstamp, oldest)) + oldest = skb->tstamp; continue; } } @@ -186,7 +187,8 @@ static void rxrpc_resend(struct rxrpc_call *call) call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } - call->resend_at = resend_at; + resend_at = ktime_sub(ktime_add_ns(oldest, rxrpc_resend_timeout), now); + call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. 
If an ACK comes in whilst the @@ -205,15 +207,12 @@ static void rxrpc_resend(struct rxrpc_call *call) spin_unlock_bh(&call->lock); if (rxrpc_send_data_packet(call, skb) < 0) { - call->resend_at = now + 2; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } if (rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - sp = rxrpc_skb(skb); - sp->resend_at = now + rxrpc_resend_timeout; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a0a5bd108c9e..c121949de3c8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -355,6 +355,38 @@ ack: _leave(" [queued]"); } +/* + * Process a requested ACK. + */ +static void rxrpc_input_requested_ack(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + ktime_t sent_at; + int ix; + + for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) { + skb = call->rxtx_buffer[ix]; + if (!skb) + continue; + + sp = rxrpc_skb(skb); + if (sp->hdr.serial != orig_serial) + continue; + smp_rmb(); + sent_at = skb->tstamp; + goto found; + } + return; + +found: + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack, + orig_serial, ack_serial, sent_at, resp_time); +} + /* * Process a ping response. */ @@ -508,6 +540,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) rxrpc_input_ping_response(call, skb->tstamp, acked_serial, sp->hdr.serial); + if (buf.ack.reason == RXRPC_ACK_REQUESTED) + rxrpc_input_requested_ack(call, skb->tstamp, acked_serial, + sp->hdr.serial); if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 56e668352fc7..0d425e707f22 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -68,9 +68,9 @@ unsigned int rxrpc_rx_mtu = 5692; unsigned int rxrpc_rx_jumbo_max = 4; /* - * Time till packet resend (in jiffies). + * Time till packet resend (in milliseconds). 
*/ -unsigned int rxrpc_resend_timeout = 4 * HZ; +unsigned int rxrpc_resend_timeout = 4 * 1000; const char *const rxrpc_pkts[] = { "?00", @@ -186,8 +186,10 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { [rxrpc_rtt_tx_ping] = "PING", + [rxrpc_rtt_tx_data] = "DATA", }; const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", + [rxrpc_rtt_rx_requested_ack] = "RACK", }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0d89cd3f2c01..db01fbb70d23 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -300,9 +300,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: - if (ret == 0) { - sp->resend_at = jiffies + rxrpc_resend_timeout; + if (ret >= 0) { + skb->tstamp = ktime_get_real(); + smp_wmb(); sp->hdr.serial = serial; + if (whdr.flags & RXRPC_REQUEST_ACK) + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3c969de3ef05..607223f4f871 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -137,7 +137,6 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - sp->resend_at = jiffies + rxrpc_resend_timeout; ret = rxrpc_send_data_packet(call, skb); if (ret < 0) { _debug("need instant resend %d", ret); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index a03c61c672f5..13d1df03ebac 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -59,7 +59,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_resend_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { -- cgit v1.2.3 From de1d657816c6fbb70f07b01d50ec669dff0d4e60 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:14 -0700 Subject: tcp: fix under-accounting retransmit SNMP counters This patch fixes these under-accounting SNMP rtx stats LINUX_MIB_TCPFORWARDRETRANS LINUX_MIB_TCPFASTRETRANS LINUX_MIB_TCPSLOWSTARTRETRANS when retransmitting TSO packets Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f53d0cca5fa4..e15ec82a6319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,7 +2831,7 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS(sock_net(sk), mib_idx); + NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); -- cgit v1.2.3 From 7e32b44361abc77fbc01f2b97b045c405b2583e5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:15 -0700 Subject: tcp: properly account Fast Open SYN-ACK retrans Since the TFO socket is accepted right off SYN-data, the socket owner can call getsockopt(TCP_INFO) to collect ongoing SYN-ACK retransmission or timeout stats (i.e., tcpi_total_retrans, tcpi_retransmits). Currently those stats are only updated upon handshake completes. This patch fixes it. 
Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 ++ net/ipv4/tcp_timer.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc3..08323bd95f2a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,7 +5885,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { - tp->total_retrans = req->num_retrans; + inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e15ec82a6319..5288cec4a2b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3568,6 +3568,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + if (unlikely(tcp_passive_fastopen(sk))) + tcp_sk(sk)->total_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d84930b2dd95..f712b411f6ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; + icsk->icsk_retransmits++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } -- cgit v1.2.3 From 0d4b103c008ac9f6f438d2618c155f6e868e5a67 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Reduce the number of ACK-Requests sent Reduce the number of ACK-Requests we set on DATA packets that we're sending to reduce network traffic. We set the flag on odd-numbered DATA packets to start off the RTT cache until we have at least three entries in it and then probe once per second thereafter to keep it topped up. This could be made tunable in future. Note that from this point, the RXRPC_REQUEST_ACK flag is set on DATA packets as we transmit them and not stored statically in the sk_buff. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/output.c | 13 +++++++++++-- net/rxrpc/peer_object.c | 1 + net/rxrpc/sendmsg.c | 2 -- 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1c4597b2c6cd..b13754a6dd7a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -255,6 +255,7 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 + ktime_t rtt_last_req; /* Time of last RTT request */ u64 rtt; /* Current RTT estimate (in nS) */ u64 rtt_sum; /* Sum of cache contents */ u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index db01fbb70d23..282cb1e36d06 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -270,6 +270,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; + /* If our RTT cache needs working on, request an ACK. 
*/ + if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + whdr.flags |= RXRPC_REQUEST_ACK; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { @@ -301,11 +307,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) done: if (ret >= 0) { - skb->tstamp = ktime_get_real(); + ktime_t now = ktime_get_real(); + skb->tstamp = now; smp_wmb(); sp->hdr.serial = serial; - if (whdr.flags & RXRPC_REQUEST_ACK) + if (whdr.flags & RXRPC_REQUEST_ACK) { + call->peer->rtt_last_req = now; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); + } } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index f3e5766910fd..941b724d523b 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -244,6 +244,7 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) peer->hash_key = hash_key; rxrpc_assess_MTU_size(peer); peer->mtu = peer->if_mtu; + peer->rtt_last_req = ktime_get_real(); switch (peer->srx.transport.family) { case AF_INET: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 607223f4f871..ca7c3be60ad2 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -299,8 +299,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, else if (call->tx_top - call->tx_hard_ack < call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (seq & 1) - sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = conn->security->secure_packet( call, skb, skb->mark, skb->head); -- cgit v1.2.3 From fc943f67773487bb85131273f39b5f183caafe95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:32 +0100 Subject: rxrpc: Reduce the number of PING ACKs sent We don't want to send a PING ACK for every new incoming call as that just adds to the network traffic. Instead, we send a PING ACK to the first three that we receive and then once per second thereafter. This could probably be made adjustable in future. 
Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/input.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index adb2ec61e21f..6e2ea8f4ae75 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -142,7 +142,7 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at; int ix; u8 annotation, anno_type; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c121949de3c8..cbb5d53f09d7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -44,9 +44,12 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, int skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true); + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + true, true); } /* -- cgit v1.2.3 From 21641c2e1ffd0b504610a33beaeab8fcc5140677 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 18 Sep 2016 15:52:20 -0700 Subject: net_sched: check NULL on error path in route4_change() On error path in route4_change(), 'f' could be NULL, so we should check NULL before calling tcf_exts_destroy(). Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Reported-by: kbuild test robot Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index a4ce39b19be0..455fc8f83d0a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -559,7 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return 0; errout: - tcf_exts_destroy(&f->exts); + if (f) + tcf_exts_destroy(&f->exts); kfree(f); return err; } -- cgit v1.2.3 From a3007446e53af07c53bdb4cabad7b3ea60859da4 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 20 Sep 2016 18:19:13 -0300 Subject: sctp: fix the handling of SACK Gap Ack blocks sctp_acked() is using 32bit arithmetics on 16bits vars, via TSN_lte() macros, which is weird and confusing. Once the offset to ctsn is calculated, all wrapping is already handled and thus to verify the Gap Ack blocks we can just use pure less/big-or-equal than checks. Also, rename gap variable to tsn_offset, so it's more meaningful, as it doesn't point to any gap at all. Even so, I don't think this discrepancy resulted in any practical bug. This patch is a preparation for the next one, which will introduce typecheck() for TSN_lte() macros and would cause a compile error here. Suggested-by: David Laight Reported-by: David Laight Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/outqueue.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 8c3f446d965c..3ec6da8bbb53 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1719,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { int i; sctp_sack_variable_t *frags; - __u16 gap; + __u16 tsn_offset, blocks; __u32 ctsn = ntohl(sack->cum_tsn_ack); if (TSN_lte(tsn, ctsn)) @@ -1738,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) */ frags = sack->variable; - gap = tsn - ctsn; - for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { - if (TSN_lte(ntohs(frags[i].gab.start), gap) && - TSN_lte(gap, ntohs(frags[i].gab.end))) + blocks = ntohs(sack->num_gap_ack_blocks); + tsn_offset = tsn - ctsn; + for (i = 0; i < blocks; ++i) { + if (tsn_offset >= ntohs(frags[i].gab.start) && + tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } -- cgit v1.2.3 From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 08:58:55 -0700 Subject: net_sched: sch_fq: account for schedule/timers drifts It looks like the following patch can make FQ very precise, even in VM or stressed hosts. It matters at high pacing rates. We take into account the difference between the time that was programmed when last packet was sent, and current time (a drift of tens of usecs is often observed) Add an EWMA of the unthrottle latency to help diagnostics. This latency is the difference between current time and oldest packet in delayed RB-tree. This accounts for the high resolution timer latency, but can be different under stress, as fq_check_throttled() can be opportunistically be called from a dequeue() called after an enqueue() for a different flow. Tested: // Start a 10Gbit flow $ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr & Before patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17106.04 756876.84 1102.75 1119049.02 0.00 0.00 0.52 After patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17867.00 800245.90 1151.77 1183172.12 0.00 0.00 0.52 A new iproute2 tc can output the 'unthrottle latency' : $ tc -s qd sh dev eth0 | grep latency 0 gc, 0 highprio, 32490767 throttled, 2382 ns latency Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_fq.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f8e39dbaa781..df7451d35131 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -811,7 +811,7 @@ struct tc_fq_qd_stats { __u32 flows; __u32 inactive_flows; __u32 throttled_flows; - __u32 pad; + __u32 unthrottle_latency_ns; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 5dd929cc1423..18e752439f6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { struct fq_flow *f = container_of(p, struct fq_flow, rate_node); @@ -515,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us. + */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; - st.pad = 0; - + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From b24d2891cfb0a7975b0039743439c98fe7b7dea7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:17:33 +0100 Subject: rxrpc: Preset timestamp on Tx sk_buffs Set the timestamp on sk_buffs holding packets to be transmitted before queueing them because the moment the packet is on the queue it can be seen by the retransmission algorithm - which may see a completely random timestamp. If the retransmission algorithm sees such a timestamp, it may retransmit the packet and, in future, tell the congestion management algorithm that the retransmit timer expired. 
Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ca7c3be60ad2..ca3811bfbd17 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -99,6 +99,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); + /* We have to set the timestamp before queueing as the retransmit + * algorithm can see the packet as soon as we queue it. + */ + skb->tstamp = ktime_get_real(); + ix = seq & RXRPC_RXTX_BUFF_MASK; rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; -- cgit v1.2.3 From 9aff212bd677829189fae2e2e408cefc196ae5ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Don't send an ACK at the end of service call response transmission Don't send an IDLE ACK at the end of the transmission of the response to a service call. The service end resends DATA packets until the client sends an ACK that hard-acks all the send data. At that point, the call is complete. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6ba4af5a8d95..99e4c0ae30f1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -143,8 +143,6 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); - } else { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false); } write_lock_bh(&call->state_lock); -- cgit v1.2.3 From c0d058c21c69b3685c3f1bb008aa11f1a5eaee7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Make sure sendmsg() is woken on call completion Make sure that sendmsg() gets woken up if the call it is waiting for completes abnormally. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b13754a6dd7a..808ab750dc6b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -758,6 +758,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->error = error; call->completion = compl, call->state = RXRPC_CALL_COMPLETE; + wake_up(&call->waitq); return true; } return false; -- cgit v1.2.3 From 90bd684ded900673d86f64f4b4197704a38f04bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Should be using ktime_add_ms() not ktime_add_ns() ktime_add_ms() should be used to add the resend time (in ms) rather than ktime_add_ns(). Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6e2ea8f4ae75..a2909da5d581 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -187,7 +187,7 @@ static void rxrpc_resend(struct rxrpc_call *call) call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } - resend_at = ktime_sub(ktime_add_ns(oldest, rxrpc_resend_timeout), now); + resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); /* Now go through the Tx window and perform the retransmissions. 
We -- cgit v1.2.3 From 4acfee8143b33efa8bda6f03fe1462d545ff8170 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:21 -0400 Subject: net: dsa: add port STP state helper Add a void helper to set the STP state of a port, checking first if the required routine is provided by the driver. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9ecbe787f102..fd78d4c9b197 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -69,6 +69,12 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) return !!p->bridge_dev; } +static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) +{ + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, port, state); +} + static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -104,8 +110,7 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, stp_state); + dsa_port_set_stp_state(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -147,8 +152,7 @@ static int dsa_slave_close(struct net_device *dev) if (ds->ops->port_disable) ds->ops->port_disable(ds, p->port, p->phy); - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); + dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -354,7 +358,7 @@ static int dsa_slave_stp_state_set(struct net_device *dev, if (switchdev_trans_ph_prepare(trans)) return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; - ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state); + dsa_port_set_stp_state(ds, p->port, attr->u.stp_state); return 0; } @@ -556,8 +560,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev) /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); + dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, -- cgit v1.2.3 From 732f794c1baf58e1eb2be4431635829c3da655bd Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:22 -0400 Subject: net: dsa: add port fast ageing Today the DSA drivers are in charge of flushing the MAC addresses associated to a port when its STP state changes from Learning or Forwarding, to Disabled or Blocking or Listening. This makes the drivers more complex and hides the generic switch logic. Introduce a new optional port_fast_age operation to dsa_switch_ops, to move this logic to the DSA layer and keep drivers simple. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 ++ net/dsa/slave.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 7556646db2d3..b122196d5a1f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -143,6 +143,7 @@ struct dsa_port { struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; + u8 stp_state; }; struct dsa_switch { @@ -339,6 +340,7 @@ struct dsa_switch_ops { void (*port_bridge_leave)(struct dsa_switch *ds, int port); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + void (*port_fast_age)(struct dsa_switch *ds, int port); /* * VLAN support diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fd78d4c9b197..6b1282c006b1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -71,8 +71,26 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) { + struct dsa_port *dp = &ds->ports[port]; + if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; } static int dsa_slave_open(struct net_device *dev) -- cgit v1.2.3 From 2d48c5f9335e48ddac7a52db10bf3bfd01986b9c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:35 +0200 Subject: bpf: use skb_to_full_sk helper in bpf_skb_under_cgroup We need to use skb_to_full_sk() helper introduced in commit bd5eb35f16a9 ("xfrm: take care of request sockets") as otherwise we miss tcp synack messages, since ownership is on request socket and therefore it would miss the sk_fullsock() check. Use skb_to_full_sk() as also done similarly in the bpf_get_cgroup_classid() helper via 2309236c13fe ("cls_cgroup: get sk_classid only from full sockets") fix to not let this fall through. Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0920c2ac1d00..e5d997759d5e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2408,7 +2408,7 @@ BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, struct cgroup *cgrp; struct sock *sk; - sk = skb->sk; + sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) -- cgit v1.2.3 From 669dc4d76d0ecc2d795df735839f43cfddf9f617 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:36 +0200 Subject: bpf: use bpf_get_smp_processor_id_proto instead of raw one Same motivation as in commit 80b48c445797 ("bpf: don't use raw processor id in generic helper"), but this time for XDP typed programs. Thus, allow for preemption checks when we have DEBUG_PREEMPT enabled, and otherwise use the raw variant. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e5d997759d5e..acf84fbfb043 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2551,6 +2551,8 @@ xdp_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e07432b9f8b8..f09c70b97eca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -419,6 +419,13 @@ enum bpf_func_id { */ BPF_FUNC_csum_update, + /** + * bpf_set_hash_invalid(skb) + * Invalidate current skb>hash. + * @skb: pointer to skb + */ + BPF_FUNC_set_hash_invalid, + __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index acf84fbfb043..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From 98dafac5697fbe1fb4bef9e3204baf9051641b00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 14:04:38 +0100 Subject: rxrpc: Use before_eq() and friends to compare serial numbers before_eq() and friends should be used to compare serial numbers (when not checking for (non)equality) rather than casting to int, subtracting and checking the result. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cbb5d53f09d7..06027b6d9c19 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -578,7 +578,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } /* Discard any out-of-order or duplicate ACKs. 
*/ - if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) { + if (before_eq(sp->hdr.serial, call->acks_latest)) { _debug("discard ACK %d <= %d", sp->hdr.serial, call->acks_latest); return; -- cgit v1.2.3 From dfc3da4404ad1ec42a0a649a4ffa2b0f37e80352 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Need to start the resend timer on initial transmission When a DATA packet has its initial transmission, we may need to start or adjust the resend timer. Without this we end up relying on being sent a NACK to initiate the resend. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 2 +- net/rxrpc/sendmsg.c | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 808ab750dc6b..9e3ba4dc9578 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -704,6 +704,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ +void rxrpc_set_timer(struct rxrpc_call *); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index a2909da5d581..3a7f90a2659c 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -static void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call) { unsigned long t, now = jiffies; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ca3811bfbd17..7cb34b2dfba9 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -146,6 +146,15 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); + } else { + unsigned long resend_at; + + resend_at = jiffies + msecs_to_jiffies(rxrpc_resend_timeout); + + if (time_before(resend_at, call->resend_at)) { + call->resend_at = resend_at; + rxrpc_set_timer(call); + } } rxrpc_free_skb(skb, rxrpc_skb_tx_freed); -- cgit v1.2.3 From be8aa3380678183821bd7d7b5dec845f10d776ce Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Fix accidental cancellation of scheduled resend by ACK parser When rxrpc_input_soft_acks() is parsing the soft-ACKs from an ACK packet, it updates the Tx packet annotations in the annotation buffer. If a soft-ACK is an ACK, then we overwrite unack'd, nak'd or to-be-retransmitted states and that is fine; but if the soft-ACK is an NACK, we overwrite the to-be-retransmitted with a nak - which isn't. Instead, we need to let any scheduled retransmission stand if the packet was NAK'd. Note that we don't reissue a resend if the annotation is in the to-be-retransmitted state because someone else must've scheduled the resend already. 
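As a minimal sketch of that rule (a hypothetical helper, not code from this patch), the soft-ACK parser should only downgrade a transmit slot to NAK when the slot is not already NAK'd or queued for retransmission:

	/* Illustrative sketch only: decide whether a soft NACK may overwrite
	 * the current annotation; RXRPC_TX_ANNO_* are the constants from
	 * ar-internal.h.
	 */
	static bool soft_nack_may_overwrite(u8 annotation)
	{
		u8 anno_type = annotation & RXRPC_TX_ANNO_MASK;

		/* Leave already-NAK'd slots and slots scheduled for
		 * retransmission alone; clobbering RETRANS would cancel the
		 * pending resend.
		 */
		return anno_type != RXRPC_TX_ANNO_NAK &&
		       anno_type != RXRPC_TX_ANNO_RETRANS;
	}
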
Signed-off-by: David Howells --- net/rxrpc/input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 06027b6d9c19..d3d69ab1f0a1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -479,6 +479,8 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, case RXRPC_ACK_TYPE_NACK: if (anno_type == RXRPC_TX_ANNO_NAK) continue; + if (anno_type == RXRPC_TX_ANNO_RETRANS) + continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; resend = true; -- cgit v1.2.3 From 01a88f7f6bd4514de9551c3fc9a6fd9e65cbf79d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Fix call timer Fix the call timer in the following ways: (1) If call->resend_at or call->ack_at are before or equal to the current time, then ignore that timeout. (2) If call->expire_at is before or equal to the current time, then don't set the timer at all (possibly we should queue the call). (3) Don't skip modifying the timer if timer_pending() is true. This indicates that the timer is working, not that it has expired and is running/waiting to run its expiry handler. Also call rxrpc_set_timer() to start the call timer going rather than calling add_timer(). Signed-off-by: David Howells --- net/rxrpc/call_event.c | 25 ++++++++++++++----------- net/rxrpc/call_object.c | 4 ++-- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3a7f90a2659c..8bc5c8e37ab4 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -28,24 +28,27 @@ void rxrpc_set_timer(struct rxrpc_call *call) { unsigned long t, now = jiffies; - _enter("{%ld,%ld,%ld:%ld}", - call->ack_at - now, call->resend_at - now, call->expire_at - now, - call->timer.expires - now); - read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { - t = call->ack_at; - if (time_before(call->resend_at, t)) + t = call->expire_at; + if (time_before_eq(t, now)) + goto out; + + if (time_after(call->resend_at, now) && + time_before(call->resend_at, t)) t = call->resend_at; - if (time_before(call->expire_at, t)) - t = call->expire_at; - if (!timer_pending(&call->timer) || - time_before(t, call->timer.expires)) { - _debug("set timer %ld", t - now); + + if (time_after(call->ack_at, now) && + time_before(call->ack_at, t)) + t = call->ack_at; + + if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); } } + +out: read_unlock_bh(&call->state_lock); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f50a6094e198..f2fadf667e19 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -199,8 +199,8 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->expire_at = expire_at; call->ack_at = expire_at; call->resend_at = expire_at; - call->timer.expires = expire_at; - add_timer(&call->timer); + call->timer.expires = expire_at + 1; + rxrpc_set_timer(call); } /* -- cgit v1.2.3 From 70790dbe3f6651fb66ad38da0a1e24368778bc16 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Pass the last Tx packet marker in the annotation buffer When the last packet of data to be transmitted on a call is queued, tx_top is set and then the RXRPC_CALL_TX_LAST flag is set. 
Unfortunately, this leaves a race in the ACK processing side of things because the flag affects the interpretation of tx_top and also allows us to start receiving reply data before we've finished transmitting. To fix this, make the following changes: (1) rxrpc_queue_packet() now sets a marker in the annotation buffer instead of setting the RXRPC_CALL_TX_LAST flag. (2) rxrpc_rotate_tx_window() detects the marker and sets the flag in the same context as the routines that use it. (3) rxrpc_end_tx_phase() is simplified to just shift the call state. The Tx window must have been rotated before calling to discard the last packet. (4) rxrpc_receiving_reply() is added to handle the arrival of the first DATA packet of a reply to a client call (which is an implicit ACK of the Tx phase). (5) The last part of rxrpc_input_ack() is reordered to perform Tx rotation, then soft-ACK application and then to end the phase if we've rotated the last packet. In the event of a terminal ACK, the soft-ACK application will be skipped as nAcks should be 0. (6) rxrpc_input_ackall() now has to rotate as well as ending the phase. In addition: (7) Alter the transmit tracepoint to log the rotation of the last packet. (8) Remove the no-longer relevant queue_reqack tracepoint note. The ACK-REQUESTED packet header flag is now set as needed when we actually transmit the packet and may vary by retransmission. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++- net/rxrpc/input.c | 102 +++++++++++++++++++++++++++++++----------------- net/rxrpc/misc.c | 3 +- net/rxrpc/sendmsg.c | 14 +++---- 4 files changed, 81 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9e3ba4dc9578..a494d56eb236 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -508,7 +508,9 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_NAK 2 #define RXRPC_TX_ANNO_RETRANS 3 #define RXRPC_TX_ANNO_MASK 0x03 -#define RXRPC_TX_ANNO_RESENT 0x04 +#define RXRPC_TX_ANNO_LAST 0x04 +#define RXRPC_TX_ANNO_RESENT 0x08 + #define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ #define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ @@ -621,9 +623,10 @@ extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; enum rxrpc_transmit_trace { rxrpc_transmit_wait, rxrpc_transmit_queue, - rxrpc_transmit_queue_reqack, rxrpc_transmit_queue_last, rxrpc_transmit_rotate, + rxrpc_transmit_rotate_last, + rxrpc_transmit_await_reply, rxrpc_transmit_end, rxrpc_transmit__nr_trace }; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index d3d69ab1f0a1..fb3e2f6afa3b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) { struct sk_buff *skb, *list = NULL; int ix; + u8 annotation; spin_lock(&call->lock); @@ -66,16 +67,22 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; + annotation = call->rxtx_annotations[ix]; rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; list = skb; + + if (annotation & RXRPC_TX_ANNO_LAST) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); } spin_unlock(&call->lock); - trace_rxrpc_transmit(call, rxrpc_transmit_rotate); + trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, 
&call->flags) ? + rxrpc_transmit_rotate_last : + rxrpc_transmit_rotate)); wake_up(&call->waitq); while (list) { @@ -92,42 +99,65 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) * This occurs when we get an ACKALL packet, the first DATA packet of a reply, * or a final ACK packet. */ -static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, + const char *abort_why) { - _enter(""); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - return true; - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_SERVER_AWAIT_ACK: - break; - default: - rxrpc_proto_abort(abort_why, call, call->tx_top); - return false; - } - rxrpc_rotate_tx_window(call, call->tx_top); + ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); write_lock(&call->state_lock); switch (call->state) { - default: - break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->tx_phase = false; - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + if (reply_begun) + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + else + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; break; + case RXRPC_CALL_SERVER_AWAIT_ACK: __rxrpc_call_completed(call); rxrpc_notify_socket(call); break; + + default: + goto bad_state; } write_unlock(&call->state_lock); - trace_rxrpc_transmit(call, rxrpc_transmit_end); + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_end); + } _leave(" = ok"); return true; + +bad_state: + write_unlock(&call->state_lock); + kdebug("end_tx %s", rxrpc_call_states[call->state]); + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; +} + +/* + * Begin the reply reception phase of a call. + */ +static bool rxrpc_receiving_reply(struct rxrpc_call *call) +{ + rxrpc_seq_t top = READ_ONCE(call->tx_top); + + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_rotate_tx_window(call, top); + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_proto_abort("TXL", call, top); + return false; + } + if (!rxrpc_end_tx_phase(call, true, "ETD")) + return false; + call->tx_phase = false; + return true; } /* @@ -226,8 +256,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. 
*/ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY && - !rxrpc_end_tx_phase(call, "ETD")) + if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || + call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && + !rxrpc_receiving_reply(call)) return; call->ackr_prev_seq = seq; @@ -587,27 +618,26 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } call->acks_latest = sp->hdr.serial; - if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - hard_ack == call->tx_top) { - rxrpc_end_tx_phase(call, "ETA"); - return; - } - if (before(hard_ack, call->tx_hard_ack) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort("AKW", call, 0); + if (nr_acks > call->tx_top - hard_ack) + return rxrpc_proto_abort("AKN", call, 0); if (after(hard_ack, call->tx_hard_ack)) rxrpc_rotate_tx_window(call, hard_ack); - if (after(first_soft_ack, call->tx_top)) + if (nr_acks > 0) { + if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); + } + + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_end_tx_phase(call, false, "ETA"); return; + } - if (nr_acks > call->tx_top - first_soft_ack + 1) - nr_acks = first_soft_ack - call->tx_top + 1; - if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) - return rxrpc_proto_abort("XSA", call, 0); - rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); } /* @@ -619,7 +649,9 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) _proto("Rx ACKALL %%%u", sp->hdr.serial); - rxrpc_end_tx_phase(call, "ETL"); + rxrpc_rotate_tx_window(call, call->tx_top); + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_end_tx_phase(call, false, "ETL"); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 0d425e707f22..fe648711c2f7 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -155,9 +155,10 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_wait] = "WAI", [rxrpc_transmit_queue] = "QUE", - [rxrpc_transmit_queue_reqack] = "QRA", [rxrpc_transmit_queue_last] = "QLS", [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_rotate_last] = "RLS", + [rxrpc_transmit_await_reply] = "AWR", [rxrpc_transmit_end] = "END", }; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7cb34b2dfba9..93e6584cd751 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -94,11 +94,15 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; int ret, ix; + u8 annotation = RXRPC_TX_ANNO_UNACK; _net("queue skb %p [%d]", skb, seq); ASSERTCMP(seq, ==, call->tx_top + 1); + if (last) + annotation |= RXRPC_TX_ANNO_LAST; + /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. 
*/ @@ -106,18 +110,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ix = seq & RXRPC_RXTX_BUFF_MASK; rxrpc_get_skb(skb, rxrpc_skb_tx_got); - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) { - set_bit(RXRPC_CALL_TX_LAST, &call->flags); + if (last) trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); - } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); - } else { + else trace_rxrpc_transmit(call, rxrpc_transmit_queue); - } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.3 From b86e218e0d422488e0febb07620fa97ae9713779 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:08:48 +0100 Subject: rxrpc: Don't call the tx_ack tracepoint if don't generate an ACK rxrpc_send_call_packet() is invoking the tx_ack tracepoint before it checks whether there's an ACK to transmit (another thread may jump in and transmit it). Fix this by only invoking the tracepoint if we get a valid ACK to transmit. Further, only allocate a serial number if we're going to actually transmit something. Signed-off-by: David Howells --- net/rxrpc/output.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 282cb1e36d06..5c1e008a5323 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -80,9 +80,6 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); - trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, - top - hard_ack); - *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; @@ -119,8 +116,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) return -ENOMEM; } - serial = atomic_inc_return(&conn->serial); - msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; @@ -131,7 +126,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.cid = htonl(call->cid); pkt->whdr.callNumber = htonl(call->call_id); pkt->whdr.seq = 0; - pkt->whdr.serial = htonl(serial); pkt->whdr.type = type; pkt->whdr.flags = conn->out_clientflag; pkt->whdr.userStatus = 0; @@ -157,14 +151,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(pkt->ack.maxSkew), - ntohl(pkt->ack.firstPacket), - ntohl(pkt->ack.previousPacket), - ntohl(pkt->ack.serial), - rxrpc_acks(pkt->ack.reason), - pkt->ack.nAcks); iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; @@ -176,7 +162,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) case RXRPC_PACKET_TYPE_ABORT: abort_code = call->abort_code; pkt->abort_code = htonl(abort_code); - _proto("Tx ABORT %%%u { %d }", serial, abort_code); iov[0].iov_len += sizeof(pkt->abort_code); len += sizeof(pkt->abort_code); ioc = 1; @@ -188,6 +173,17 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } + serial = atomic_inc_return(&conn->serial); + pkt->whdr.serial = htonl(serial); + switch (type) { + case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(call, + ntohl(pkt->ack.firstPacket), + ntohl(pkt->ack.serial), + pkt->ack.reason, pkt->ack.nAcks); + break; + } + if (ping) { 
call->ackr_ping = serial; smp_wmb(); -- cgit v1.2.3 From fc7ab6d29a3af0b7f6df7c095509378c8caf85b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:22:36 +0100 Subject: rxrpc: Add a tracepoint for the call timer Add a tracepoint to log call timer initiation, setting and expiry. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 36 ++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 13 ++++++++++++- net/rxrpc/call_event.c | 7 ++++--- net/rxrpc/call_object.c | 6 ++++-- net/rxrpc/misc.c | 8 ++++++++ net/rxrpc/sendmsg.c | 2 +- 6 files changed, 65 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e8f2afbbe0bf..57322897d745 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -414,6 +414,42 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->avg) ); +TRACE_EVENT(rxrpc_timer, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why, + unsigned long now), + + TP_ARGS(call, why, now), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_timer_trace, why ) + __field(unsigned long, now ) + __field(unsigned long, expire_at ) + __field(unsigned long, ack_at ) + __field(unsigned long, resend_at ) + __field(unsigned long, timer ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->now = now; + __entry->expire_at = call->expire_at; + __entry->ack_at = call->ack_at; + __entry->resend_at = call->resend_at; + __entry->timer = call->timer.expires; + ), + + TP_printk("c=%p %s now=%lx x=%ld a=%ld r=%ld t=%ld", + __entry->call, + rxrpc_timer_traces[__entry->why], + __entry->now, + __entry->expire_at - __entry->now, + __entry->ack_at - __entry->now, + __entry->resend_at - __entry->now, + __entry->timer - __entry->now) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a494d56eb236..e564eca75985 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -678,6 +678,17 @@ enum rxrpc_rtt_rx_trace { extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; +enum rxrpc_timer_trace { + rxrpc_timer_begin, + rxrpc_timer_expired, + rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_resend, + rxrpc_timer_set_for_send, + rxrpc_timer__nr_trace +}; + +extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -707,7 +718,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8bc5c8e37ab4..90e970ba048a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) { unsigned long t, now = jiffies; @@ -45,6 +45,7 @@ void rxrpc_set_timer(struct rxrpc_call *call) if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); + trace_rxrpc_timer(call, why, now); } } @@ -120,7 +121,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); if (time_before(ack_at, 
call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } } @@ -293,7 +294,7 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f2fadf667e19..a53f4c2c0025 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -76,8 +76,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); rxrpc_queue_call(call); + } } /* @@ -200,7 +202,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->ack_at = expire_at; call->resend_at = expire_at; call->timer.expires = expire_at + 1; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_begin); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fe648711c2f7..fa9942fabdf2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -194,3 +194,11 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", [rxrpc_rtt_rx_requested_ack] = "RACK", }; + +const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { + [rxrpc_timer_begin] = "Begin ", + [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_send] = "SetTx ", + [rxrpc_timer_set_for_resend] = "SetRTx", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 93e6584cd751..99939372b5a4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -153,7 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (time_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_send); } } -- cgit v1.2.3 From be832aecc5ba811728e15a10f675f4a2187f25dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint Add a tracepoint to log transmission of DATA packets (including loss injection). Adjust the ACK transmission tracepoint to include the packet serial number and to line this up with the DATA transmission display. 
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 50 +++++++++++++++++++++++++++++++++++++------- net/rxrpc/conn_event.c | 5 ++--- net/rxrpc/output.c | 5 ++++- 3 files changed, 48 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 57322897d745..6001bf93dc79 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -256,33 +256,67 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_tx_data, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, + rxrpc_serial_t serial, u8 flags, bool lose), + + TP_ARGS(call, seq, serial, flags, lose), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(rxrpc_serial_t, serial ) + __field(u8, flags ) + __field(bool, lose ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->serial = serial; + __entry->flags = flags; + __entry->lose = lose; + ), + + TP_printk("c=%p DATA %08x q=%08x fl=%02x%s", + __entry->call, + __entry->serial, + __entry->seq, + __entry->flags, + __entry->lose ? " *LOSE*" : "") + ); + TRACE_EVENT(rxrpc_tx_ack, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first, - rxrpc_serial_t serial, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, + u8 reason, u8 n_acks), - TP_ARGS(call, first, serial, reason, n_acks), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks), TP_STRUCT__entry( __field(struct rxrpc_call *, call ) - __field(rxrpc_seq_t, first ) __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, ack_first ) + __field(rxrpc_serial_t, ack_serial ) __field(u8, reason ) __field(u8, n_acks ) ), TP_fast_assign( __entry->call = call; - __entry->first = first; __entry->serial = serial; + __entry->ack_first = ack_first; + __entry->ack_serial = ack_serial; __entry->reason = reason; __entry->n_acks = n_acks; ), - TP_printk("c=%p %s f=%08x r=%08x n=%u", + TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), - __entry->first, __entry->serial, + rxrpc_acks(__entry->reason), + __entry->ack_first, + __entry->ack_serial, __entry->n_acks) ); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 75a15a4c74c3..a1cf1ec5f29e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,9 +98,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); - - trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, - RXRPC_ACK_DUPLICATE, 0); break; } @@ -122,6 +119,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); _proto("Tx ACK %%%u [re]", serial); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5c1e008a5323..e47fbd1c836d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -177,7 +177,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.serial = htonl(serial); switch (type) { case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(call, + trace_rxrpc_tx_ack(call, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); @@ -275,6 +275,8 @@ int rxrpc_send_data_packet(struct rxrpc_call 
*call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, + whdr.flags, true); rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); _leave(" = 0 [lose]"); return 0; @@ -302,6 +304,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; -- cgit v1.2.3 From 89b475abdb107a74f57497b65becaf837a0e5b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add a tracepoint to log injected Rx packet loss Add a tracepoint to log received packets that get discarded due to Rx packet loss. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 21 +++++++++++++++++++++ net/rxrpc/input.c | 11 +++++------ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6001bf93dc79..9413b17ba04b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -484,6 +484,27 @@ TRACE_EVENT(rxrpc_timer, __entry->timer - __entry->now) ); +TRACE_EVENT(rxrpc_rx_lose, + TP_PROTO(struct rxrpc_skb_priv *sp), + + TP_ARGS(sp), + + TP_STRUCT__entry( + __field_struct(struct rxrpc_host_header, hdr ) + ), + + TP_fast_assign( + memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); + ), + + TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*", + __entry->hdr.epoch, __entry->hdr.cid, + __entry->hdr.callNumber, __entry->hdr.serviceId, + __entry->hdr.serial, __entry->hdr.seq, + __entry->hdr.type, __entry->hdr.flags, + __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fb3e2f6afa3b..19b1e189f5dc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,20 +837,19 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); return; } } - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", -- cgit v1.2.3 From 9c7ad434441da6b5d4ac878cac368fbdaec99b56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:50:40 +0100 Subject: rxrpc: Add tracepoint for ACK proposal Add a tracepoint to log proposed ACKs, including whether the proposal is used to update a pending ACK or is discarded in favour of an easlier, higher priority ACK. Whilst we're at it, get rid of the rxrpc_acks() function and access the name array directly. We do, however, need to validate the ACK reason number given to trace_rxrpc_rx_ack() to make sure we don't overrun the array. 
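A minimal sketch of the bounds check that validation implies (hypothetical helper; the patch open-codes the same test at the call site in rxrpc_input_ack()):

	/* Illustrative only: map any unknown on-the-wire reason code to the
	 * RXRPC_ACK__INVALID sentinel so the rxrpc_ack_names[] lookup cannot
	 * overrun the array.
	 */
	static u8 rxrpc_sanitise_ack_reason(u8 reason)
	{
		return reason < RXRPC_ACK__INVALID ? reason : RXRPC_ACK__INVALID;
	}
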
Signed-off-by: David Howells --- include/rxrpc/packet.h | 1 + include/trace/events/rxrpc.h | 42 ++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/ar-internal.h | 25 +++++++++++++++++++++++-- net/rxrpc/call_event.c | 21 ++++++++++++++------- net/rxrpc/input.c | 19 +++++++++++++------ net/rxrpc/misc.c | 30 +++++++++++++++++++----------- net/rxrpc/output.c | 3 ++- net/rxrpc/recvmsg.c | 3 ++- 8 files changed, 114 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index fd6eb3a60a8c..703a64b4681a 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -123,6 +123,7 @@ struct rxrpc_ackpacket { #define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ #define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ #define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ uint8_t nAcks; /* number of ACKs */ #define RXRPC_MAXACKS 255 diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9413b17ba04b..d67a8c6b085a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -251,7 +251,7 @@ TRACE_EVENT(rxrpc_rx_ack, TP_printk("c=%p %s f=%08x n=%u", __entry->call, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->first, __entry->n_acks) ); @@ -314,7 +314,7 @@ TRACE_EVENT(rxrpc_tx_ack, TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", __entry->call, __entry->serial, - rxrpc_acks(__entry->reason), + rxrpc_ack_names[__entry->reason], __entry->ack_first, __entry->ack_serial, __entry->n_acks) @@ -505,6 +505,44 @@ TRACE_EVENT(rxrpc_rx_lose, __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : "?UNK") ); +TRACE_EVENT(rxrpc_propose_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial, bool immediate, + bool background, enum rxrpc_propose_ack_outcome outcome), + + TP_ARGS(call, why, ack_reason, serial, immediate, background, + outcome), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + __field(bool, immediate ) + __field(bool, background ) + __field(enum rxrpc_propose_ack_outcome, outcome ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + __entry->immediate = immediate; + __entry->background = background; + __entry->outcome = outcome; + ), + + TP_printk("c=%p %s %s r=%08x i=%u b=%u%s", + __entry->call, + rxrpc_propose_ack_traces[__entry->why], + rxrpc_ack_names[__entry->ack_reason], + __entry->serial, + __entry->immediate, + __entry->background, + rxrpc_propose_ack_outcomes[__entry->outcome]) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e564eca75985..042dbcc52654 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -689,8 +689,28 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; +enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_respond_to_ack, + rxrpc_propose_ack_respond_to_ping, + rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_terminal_ack, + rxrpc_propose_ack__nr_trace +}; + +enum rxrpc_propose_ack_outcome { + rxrpc_propose_ack_use, + rxrpc_propose_ack_update, + 
rxrpc_propose_ack_subsume, + rxrpc_propose_ack__nr_outcomes +}; + +extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; +extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; + extern const char *const rxrpc_pkts[]; -extern const char *rxrpc_acks(u8 reason); +extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; #include @@ -719,7 +739,8 @@ int rxrpc_reject_call(struct rxrpc_sock *); * call_event.c */ void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 90e970ba048a..fd5b11339ffb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -58,14 +58,13 @@ out: */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u16 skew, u32 serial, bool immediate, - bool background) + bool background, + enum rxrpc_propose_ack_trace why) { + enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -74,15 +73,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); if (ack_reason == call->ackr_reason) { if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; call->ackr_skew = skew; } if (!immediate) - return; + goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; call->ackr_skew = skew; + } else { + outcome = rxrpc_propose_ack_subsume; } switch (ack_reason) { @@ -124,17 +126,22 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } + +trace: + trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, + background, outcome); } /* * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, bool background) + u16 skew, u32 serial, bool immediate, bool background, + enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); __rxrpc_propose_ACK(call, ack_reason, skew, serial, - immediate, background); + immediate, background, why); spin_unlock_bh(&call->lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 19b1e189f5dc..349698d87ad1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -49,7 +49,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, if (call->peer->rtt_usage < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true); + true, true, + rxrpc_propose_ack_ping_for_params); } /* @@ -382,7 +383,8 @@ skip: ack: if (ack) rxrpc_propose_ACK(call, ack, skew, ack_serial, - immediate_ack, true); + immediate_ack, true, + rxrpc_propose_ack_input_data); if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) rxrpc_notify_socket(call); @@ -539,6 +541,7 @@ static void 
rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { + u8 ack_reason; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -561,8 +564,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? + buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -570,7 +575,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_acks(buf.ack.reason), + rxrpc_ack_names[ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -583,10 +588,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ack); } offset = sp->offset + nr_acks + 3; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fa9942fabdf2..1ca14835d87f 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -91,17 +91,10 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_PING] = 9, }; -const char *rxrpc_acks(u8 reason) -{ - static const char *const str[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" - }; - - if (reason >= ARRAY_SIZE(str)) - reason = ARRAY_SIZE(str) - 1; - return str[reason]; -} +const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" +}; const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_rx_cleaned] = "Rx CLN", @@ -202,3 +195,18 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", }; + +const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", + [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", + [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_terminal_ack] = "ClTerm ", +}; + +const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { + [rxrpc_propose_ack_use] = "", + [rxrpc_propose_ack_update] = " Update", + [rxrpc_propose_ack_subsume] = " Subsume", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e47fbd1c836d..0c563e325c9d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -210,7 +210,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true); + true, true, + rxrpc_propose_ack_retry_tx); break; case RXRPC_PACKET_TYPE_ABORT: break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 
99e4c0ae30f1..8c7f3de45bac 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -141,7 +141,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } -- cgit v1.2.3 From c6672e3fe4a641bf302d6309ab4d5ee55648e758 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:58:55 +0100 Subject: rxrpc: Add a tracepoint to log which packets will be retransmitted Add a tracepoint to log in rxrpc_resend() which packets will be retransmitted. Note that if a positive ACK comes in whilst we have dropped the lock to retransmit another packet, the actual retransmission may not happen, though some of the effects will (such as altering the congestion management). Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 27 +++++++++++++++++++++++++++ net/rxrpc/call_event.c | 2 ++ 2 files changed, 29 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d67a8c6b085a..56475497043d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -543,6 +543,33 @@ TRACE_EVENT(rxrpc_propose_ack, rxrpc_propose_ack_outcomes[__entry->outcome]) ); +TRACE_EVENT(rxrpc_retransmit, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, + s64 expiry), + + TP_ARGS(call, seq, annotation, expiry), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_seq_t, seq ) + __field(u8, annotation ) + __field(s64, expiry ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->seq = seq; + __entry->annotation = annotation; + __entry->expiry = expiry; + ), + + TP_printk("c=%p q=%x a=%02x xp=%lld", + __entry->call, + __entry->seq, + __entry->annotation, + __entry->expiry) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fd5b11339ffb..a78a92fe5d77 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -196,6 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + trace_rxrpc_retransmit(call, seq, annotation | anno_type, + ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. 
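The include/uapi/linux/if_link.h part of the change is not quoted in the hunks below; the nested attribute layout is assumed to look roughly like the sketch here (one IFLA_VF_VLAN_INFO entry per VLAN nested inside the new IFLA_VF_VLAN_LIST, capped at a single entry for now):

	/* Assumed UAPI shape (not shown in this excerpt): each entry nested
	 * under IFLA_VF_VLAN_LIST carries the VF index, VLAN ID, QoS and the
	 * VLAN protocol in network byte order.
	 */
	#include <linux/types.h>

	struct ifla_vf_vlan_info {
		__u32	vf;
		__u32	vlan;		/* 0 - 4095, 0 disables VLAN filtering */
		__u32	qos;
		__be16	vlan_proto;	/* ETH_P_8021Q or ETH_P_8021AD */
	};
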
Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 9 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 6 +- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 6 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 4 +- drivers/net/ethernet/intel/igb/igb_main.c | 9 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 5 +- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 6 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 2 +- .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 5 +- drivers/net/ethernet/sfc/sriov.c | 5 +- drivers/net/ethernet/sfc/sriov.h | 2 +- include/linux/if_link.h | 1 + include/linux/netdevice.h | 6 +- include/uapi/linux/if_link.h | 19 ++++- net/core/rtnetlink.c | 80 ++++++++++++++++++---- 23 files changed, 161 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 0e68fadecfdb..243cb9748d35 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -492,7 +492,8 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto, int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac); -int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); +int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 6c586b045d1d..3f77d0863543 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2521,7 +2521,8 @@ void bnx2x_pf_set_vfs_vlan(struct bnx2x *bp) for_each_vf(bp, vfidx) { bulletin = BP_VF_BULLETIN(bp, vfidx); if (bulletin->valid_bitmap & (1 << VLAN_VALID)) - bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0); + bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0, + htons(ETH_P_8021Q)); } } @@ -2781,7 +2782,8 @@ static int bnx2x_set_vf_vlan_filter(struct bnx2x *bp, struct bnx2x_virtf *vf, return 0; } -int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) +int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos, + __be16 vlan_proto) { struct pf_vf_bulletin_content *bulletin = NULL; struct bnx2x *bp = netdev_priv(dev); @@ -2796,6 +2798,9 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP(BNX2X_MSG_IOV, "configuring VF %d with VLAN %d qos %d\n", vfidx, 
vlan, 0); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 8be718508600..ec6cd18842c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -174,7 +174,8 @@ int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac) return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); } -int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) +int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) { struct hwrm_func_cfg_input req = {0}; struct bnxt *bp = netdev_priv(dev); @@ -185,6 +186,9 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos) if (bp->hwrm_spec_code < 0x10201) return -ENOTSUPP; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + rc = bnxt_vf_ndo_prep(bp, vf_id); if (rc) return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 0392670ab49c..1ab72e4820af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -12,7 +12,7 @@ int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *); int bnxt_set_vf_mac(struct net_device *, int, u8 *); -int bnxt_set_vf_vlan(struct net_device *, int, u16, u8); +int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 9a94840c5757..ac513e6627d1 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1895,7 +1895,8 @@ static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) return 0; } -static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct be_adapter *adapter = netdev_priv(netdev); struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; @@ -1907,6 +1908,9 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan || qos) { vlan |= qos << VLAN_PRIO_SHIFT; status = be_set_vf_tvt(adapter, vf, vlan); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 67ff01aeb11a..4d19e46f7c55 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -507,7 +507,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs); s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); int fm10k_ndo_set_vf_vlan(struct net_device *netdev, - int vf_idx, u16 vid, u8 qos); + int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, int unused); int fm10k_ndo_get_vf_config(struct net_device *netdev, diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index d9dec81f6b6d..5f4dac0d36ef 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ 
b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -445,7 +445,7 @@ int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac) } int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, - u8 qos) + u8 qos, __be16 vlan_proto) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_iov_data *iov_data = interface->iov_data; @@ -460,6 +460,10 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, if (qos || (vid > (VLAN_VID_MASK - 1))) return -EINVAL; + /* VF VLAN Protocol part to default is unsupported */ + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + vf_info = &iov_data->vf_info[vf_idx]; /* exit if there is nothing to do */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index da3423561b3a..724d8740d4cc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2747,11 +2747,12 @@ error_param: * @vf_id: VF identifier * @vlan_id: mac address * @qos: priority setting + * @vlan_proto: vlan protocol * * program VF vlan id and/or qos **/ -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos) +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto) { u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT); struct i40e_netdev_priv *np = netdev_priv(netdev); @@ -2774,6 +2775,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, goto error_pvid; } + if (vlan_proto != htons(ETH_P_8021Q)) { + dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); + ret = -EPROTONOSUPPORT; + goto error_pvid; + } + vf = &(pf->vf[vf_id]); vsi = pf->vsi[vf->lan_vsi_idx]; if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 875174141451..4012d069939a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -129,8 +129,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf); /* VF configuration related iplink handlers */ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); -int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, - int vf_id, u16 vlan_id, u8 qos); +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto); int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, int max_tx_rate); int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index af75eac5fa16..a83aa13a5bf4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -169,7 +169,7 @@ static int igb_set_vf_mac(struct igb_adapter *, int, unsigned char *); static void igb_restore_vf_multicasts(struct igb_adapter *adapter); static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos); + int vf, u16 vlan, u8 qos, __be16 vlan_proto); static int igb_ndo_set_vf_bw(struct net_device *, int, int, int); static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); @@ -6222,14 +6222,17 @@ static int igb_disable_port_vlan(struct igb_adapter *adapter, int vf) return 0; } 
-static int igb_ndo_set_vf_vlan(struct net_device *netdev, - int vf, u16 vlan, u8 qos) +static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) { struct igb_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->vfs_allocated_count) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return (vlan || qos) ? igb_enable_port_vlan(adapter, vf, vlan, qos) : igb_disable_port_vlan(adapter, vf); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8618599dfd6f..b18590a995db 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1354,13 +1354,16 @@ static int ixgbe_disable_port_vlan(struct ixgbe_adapter *adapter, int vf) return err; } -int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, + u8 qos, __be16 vlan_proto) { int err = 0; struct ixgbe_adapter *adapter = netdev_priv(netdev); if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7)) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; if (vlan || qos) { /* Check if there is already a port VLAN set, if so * we have to delete the old one first before we diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 47e65e2f886a..0c7977d27b71 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -43,7 +43,7 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int ixgbe_link_mbps(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index a94f8a3f026c..132eeeafcdc4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2400,11 +2400,15 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); } -static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct mlx4_en_priv *en_priv = netdev_priv(dev); struct mlx4_en_dev *mdev = en_priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c12792314be7..b58cfe37dead 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2917,11 +2917,15 @@ static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); } -static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 
vlan_proto) { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, vlan, qos); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cd23a2946db7..0e198fe89d1a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -100,7 +100,8 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev, static void qede_link_update(void *dev, struct qed_link_output *link); #ifdef CONFIG_QED_SRIOV -static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) +static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) { struct qede_dev *edev = netdev_priv(ndev); @@ -109,6 +110,9 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos) return -EINVAL; } + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n", vlan, vf); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h index 24061b9b92e8..5f327659efa7 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -238,7 +238,7 @@ int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); int qlcnic_sriov_get_vf_config(struct net_device *, int , struct ifla_vf_info *); -int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8); +int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool); #else static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index afd687e5e779..50eaafa3eaba 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -1915,7 +1915,7 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, } int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, - u16 vlan, u8 qos) + u16 vlan, u8 qos, __be16 vlan_proto) { struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_sriov *sriov = adapter->ahw->sriov; @@ -1928,6 +1928,9 @@ int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, if (vf >= sriov->num_vfs || qos > 7) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + if (vlan > MAX_VLAN_ID) { netdev_err(netdev, "Invalid VLAN ID, allowed range is [0 - %d]\n", diff --git a/drivers/net/ethernet/sfc/sriov.c b/drivers/net/ethernet/sfc/sriov.c index 816c44689e67..9abcf4aded30 100644 --- a/drivers/net/ethernet/sfc/sriov.c +++ b/drivers/net/ethernet/sfc/sriov.c @@ -22,7 +22,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac) } int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos) + u8 qos, __be16 vlan_proto) { struct efx_nic *efx = netdev_priv(net_dev); @@ -31,6 +31,9 @@ int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT))) return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + return efx->type->sriov_set_vf_vlan(efx, 
vf_i, vlan, qos); } else { return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/sriov.h b/drivers/net/ethernet/sfc/sriov.h index 400df526586d..ba1762e7f216 100644 --- a/drivers/net/ethernet/sfc/sriov.h +++ b/drivers/net/ethernet/sfc/sriov.h @@ -16,7 +16,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac); int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan, - u8 qos); + u8 qos, __be16 vlan_proto); int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i, bool spoofchk); int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f923d15b432c..0b17c585b5cd 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -25,5 +25,6 @@ struct ifla_vf_info { __u32 max_tx_rate; __u32 rss_query_en; __u32 trusted; + __be16 vlan_proto; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69f242c71865..1e8a5c734d72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -946,7 +946,8 @@ struct netdev_xdp { * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); - * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); + * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, + * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); @@ -1187,7 +1188,8 @@ struct net_device_ops { int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, - int queue, u16 vlan, u8 qos); + int queue, u16 vlan, + u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ec9e99d5491..b4fba662cd32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -619,7 +619,7 @@ enum { enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ - IFLA_VF_VLAN, + IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ @@ -631,6 +631,7 @@ enum { IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ __IFLA_VF_MAX, }; @@ -647,6 +648,22 @@ struct ifla_vf_vlan { __u32 qos; }; +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0dbae4244a89..3ac8946bf244 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct 
ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + @@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto 
nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } -- cgit v1.2.3 From 805b21b929e29192fb5de16154f616bfc1116e3e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:26 +0100 Subject: rxrpc: Send an ACK after every few DATA packets we receive Send an ACK if we haven't sent one for the last two packets we've received. This keeps the other end apprised of where we've got to - which is important if they're doing slow-start. We do this in recvmsg so that we can dispatch a packet directly without the need to wake up the background thread. This should possibly be made configurable in future. 
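For illustration, the test added to rxrpc_rotate_rx_window() below boils down to the following standalone sketch. Plain unsigned counters stand in for the kernel's wrapping rxrpc_seq_t arithmetic (after_eq()/after()); this is a simplified model of the heuristic, not the patch itself:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Propose an ACK once we have consumed two or more packets since the last
     * ACK went out, once the receive edge has moved on by two or more packets,
     * or once the receive window has been drained completely.
     */
    static bool should_propose_ack(unsigned int hard_ack,       /* highest consumed seq */
                                   unsigned int top,            /* highest received seq */
                                   unsigned int ackr_consumed,  /* hard_ack at last ACK */
                                   unsigned int ackr_seen)      /* top at last ACK */
    {
            return hard_ack >= ackr_consumed + 2 ||
                   top >= ackr_seen + 2 ||
                   (hard_ack == top && hard_ack > ackr_consumed);
    }

    int main(void)
    {
            printf("%d\n", should_propose_ack(5, 6, 4, 6)); /* 0: only one packet consumed */
            printf("%d\n", should_propose_ack(6, 6, 4, 6)); /* 1: two consumed, window drained */
            return 0;
    }

Here ackr_consumed and ackr_seen record where the window stood when the last ACK was transmitted, so the peer hears from us again after at most a couple of packets' worth of progress.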
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/misc.c | 1 + net/rxrpc/output.c | 25 +++++++++++++++++-------- net/rxrpc/recvmsg.c | 13 ++++++++++++- 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 042dbcc52654..e3bf9c0e3ad1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -533,6 +533,8 @@ struct rxrpc_call { u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ + rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ rxrpc_serial_t ackr_ping; /* Last ping sent */ ktime_t ackr_ping_time; /* Time last ping sent */ @@ -695,6 +697,7 @@ enum rxrpc_propose_ack_trace { rxrpc_propose_ack_respond_to_ack, rxrpc_propose_ack_respond_to_ping, rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_rotate_rx, rxrpc_propose_ack_terminal_ack, rxrpc_propose_ack__nr_trace }; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1ca14835d87f..a473fd7dabaa 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -202,6 +202,7 @@ const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_rotate_rx] = "RxAck ", [rxrpc_propose_ack_terminal_ack] = "ClTerm ", }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0c563e325c9d..3eb01445e814 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -36,7 +36,9 @@ struct rxrpc_pkt_buffer { * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_pkt_buffer *pkt) + struct rxrpc_pkt_buffer *pkt, + rxrpc_seq_t *_hard_ack, + rxrpc_seq_t *_top) { rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; @@ -48,6 +50,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); + *_hard_ack = hard_ack; + *_top = top; pkt->ack.bufferSpace = htons(8); pkt->ack.maxSkew = htons(call->ackr_skew); @@ -96,6 +100,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; + rxrpc_seq_t hard_ack, top; size_t len, n; bool ping = false; int ioc, ret; @@ -146,7 +151,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } ping = (call->ackr_reason == RXRPC_ACK_PING); - n = rxrpc_fill_out_ack(call, pkt); + n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top); call->ackr_reason = 0; spin_unlock_bh(&call->lock); @@ -203,18 +208,22 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) if (ping) call->ackr_ping_time = ktime_get_real(); - if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { - switch (type) { - case RXRPC_PACKET_TYPE_ACK: + if (type == RXRPC_PACKET_TYPE_ACK && + call->state < RXRPC_CALL_COMPLETE) { + if (ret < 0) { clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), true, true, rxrpc_propose_ack_retry_tx); - break; - case RXRPC_PACKET_TYPE_ABORT: - break; + } else { + spin_lock_bh(&call->lock); + if (after(hard_ack, call->ackr_consumed)) + call->ackr_consumed = hard_ack; + if (after(top, call->ackr_seen)) + call->ackr_seen = top; + 
spin_unlock_bh(&call->lock); } } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8c7f3de45bac..a7458c398b9e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -201,8 +201,19 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { rxrpc_end_rx_phase(call); + } else { + /* Check to see if there's an ACK that needs sending. */ + if (after_eq(hard_ack, call->ackr_consumed + 2) || + after_eq(top, call->ackr_seen + 2) || + (hard_ack == top && after(hard_ack, call->ackr_consumed))) + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, + true, false, + rxrpc_propose_ack_rotate_rx); + if (call->ackr_reason) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } } /* -- cgit v1.2.3 From a7056c5ba67ee6a956b42cf9ff9ba3a6a0bd9794 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Send an immediate ACK if we fill in a hole Send an immediate ACK if we fill in a hole in the buffer left by an out-of-sequence packet. This may allow the congestion management in the peer to avoid a retransmission if packets got reordered on the wire. Signed-off-by: David Howells --- net/rxrpc/input.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 349698d87ad1..757c16f033a0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -331,8 +331,16 @@ next_subpacket: call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; - if (after(seq, call->rx_top)) + if (after(seq, call->rx_top)) { smp_store_release(&call->rx_top, seq); + } else if (before(seq, call->rx_top)) { + /* Send an immediate ACK if we fill in a hole */ + if (!ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } + immediate_ack = true; + } if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); -- cgit v1.2.3 From b69d94d7991f83928d3ea18fe12ab011fa852bb0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Include the last reply DATA serial number in the final ACK In a client call, include the serial number of the last DATA packet of the reply in the final ACK. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a7458c398b9e..038ae62ddb4d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -133,7 +133,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, /* * End the packet reception phase. 
*/ -static void rxrpc_end_rx_phase(struct rxrpc_call *call) +static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); @@ -141,7 +141,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } @@ -202,7 +202,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) { - rxrpc_end_rx_phase(call); + rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ if (after_eq(hard_ack, call->ackr_consumed + 2) || -- cgit v1.2.3 From dd7c1ee59a90ca8a75bce72c721851d5550f3c59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Reinitialise the call ACK and timer state for client reply phase Clear the ACK reason, ACK timer and resend timer when entering the client reply phase when the first DATA packet is received. New ACKs will be proposed once the data is queued. The resend timer is no longer relevant and we need to cancel ACKs scheduled to probe for a lost reply. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/input.c | 9 +++++++++ net/rxrpc/misc.c | 1 + 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e3bf9c0e3ad1..cdd35e2b40ba 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -682,6 +682,7 @@ extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; enum rxrpc_timer_trace { rxrpc_timer_begin, + rxrpc_timer_init_for_reply, rxrpc_timer_expired, rxrpc_timer_set_for_ack, rxrpc_timer_set_for_resend, diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 757c16f033a0..bda11eb2ab2a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -149,6 +149,15 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) { rxrpc_seq_t top = READ_ONCE(call->tx_top); + if (call->ackr_reason) { + spin_lock_bh(&call->lock); + call->ackr_reason = 0; + call->resend_at = call->expire_at; + call->ack_at = call->expire_at; + spin_unlock_bh(&call->lock); + rxrpc_set_timer(call, rxrpc_timer_init_for_reply); + } + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) rxrpc_rotate_tx_window(call, top); if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a473fd7dabaa..901c012a2700 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -191,6 +191,7 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_begin] = "Begin ", [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_init_for_reply] = "IniRpl", [rxrpc_timer_set_for_ack] = "SetAck", [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", -- cgit v1.2.3 From df0562a72dba13ab49c7dd7cb15170697b9848ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 23:00:54 +0100 Subject: rxrpc: Delay the resend timer to allow for nsec->jiffies conv error When determining the resend timer value, we have a value in nsec but the timer is in jiffies which may be a 
million or more times more coarse. nsecs_to_jiffies() rounds down - which means that the resend timeout expressed as jiffies is very likely earlier than the one expressed as nanoseconds from which it was derived. The problem is that rxrpc_resend() gets triggered by the timer, but can't then find anything to resend yet. It sets the timer again - but gets kicked off immediately again and again until the nanosecond-based expiry time is reached and we actually retransmit. Fix this by adding 1 to the jiffies-based resend_at value to counteract the rounding and make sure that the timer happens after the nanosecond-based expiry is passed. Alternatives would be to adjust the timestamp on the packets to align with the jiffie scale or to switch back to using jiffie-timestamps. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index a78a92fe5d77..d5bf9ce7ec6f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -200,8 +200,14 @@ static void rxrpc_resend(struct rxrpc_call *call) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); - call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); + resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); + call->resend_at = jiffies + + nsecs_to_jiffies(ktime_to_ns(ktime_sub(resend_at, now))) + + 1; /* We have to make sure that the calculated jiffies value + * falls at or after the nsec value, or we shall loop + * ceaselessly because the timer times out, but we haven't + * reached the nsec timeout yet. + */ /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the -- cgit v1.2.3 From 31a1b989508ce64e8ead504884ced01e61870852 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:26 +0100 Subject: rxrpc: Generate a summary of the ACK state for later use Generate a summary of the Tx buffer packet state when an ACK is received for use in a later patch that does congestion management. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 14 ++++++++++++++ net/rxrpc/input.c | 45 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cdd35e2b40ba..1a700b6a998b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -540,6 +540,20 @@ struct rxrpc_call { /* transmission-phase ACK management */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ + rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ +}; + +/* + * Summary of a new ACK and the changes it made. + */ +struct rxrpc_ack_summary { + u8 ack_reason; + u8 nr_acks; /* Number of ACKs in packet */ + u8 nr_nacks; /* Number of NACKs in packet */ + u8 nr_new_acks; /* Number of new ACKs in packet */ + u8 nr_new_nacks; /* Number of new NACKs in packet */ + u8 nr_rot_new_acks; /* Number of rotated new ACKs */ + bool new_low_nack; /* T if new low NACK found */ }; enum rxrpc_skb_trace { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index bda11eb2ab2a..dd699667eeef 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -56,12 +56,20 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, /* * Apply a hard ACK by advancing the Tx window. 
*/ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, + struct rxrpc_ack_summary *summary) { struct sk_buff *skb, *list = NULL; int ix; u8 annotation; + if (call->acks_lowest_nak == call->tx_hard_ack) { + call->acks_lowest_nak = to; + } else if (before_eq(call->acks_lowest_nak, to)) { + summary->new_low_nack = true; + call->acks_lowest_nak = to; + } + spin_lock(&call->lock); while (before(call->tx_hard_ack, to)) { @@ -77,6 +85,8 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) if (annotation & RXRPC_TX_ANNO_LAST) set_bit(RXRPC_CALL_TX_LAST, &call->flags); + if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) + summary->nr_rot_new_acks++; } spin_unlock(&call->lock); @@ -147,6 +157,7 @@ bad_state: */ static bool rxrpc_receiving_reply(struct rxrpc_call *call) { + struct rxrpc_ack_summary summary = { 0 }; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { @@ -159,7 +170,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) - rxrpc_rotate_tx_window(call, top); + rxrpc_rotate_tx_window(call, top, &summary); if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { rxrpc_proto_abort("TXL", call, top); return false; @@ -508,7 +519,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, * the time the ACK was sent. */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, - rxrpc_seq_t seq, int nr_acks) + rxrpc_seq_t seq, int nr_acks, + struct rxrpc_ack_summary *summary) { bool resend = false; int ix; @@ -521,14 +533,23 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, annotation &= ~RXRPC_TX_ANNO_MASK; switch (*acks++) { case RXRPC_ACK_TYPE_ACK: + summary->nr_acks++; if (anno_type == RXRPC_TX_ANNO_ACK) continue; + summary->nr_new_acks++; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: + if (!summary->nr_nacks && + call->acks_lowest_nak != seq) { + call->acks_lowest_nak = seq; + summary->new_low_nack = true; + } + summary->nr_nacks++; if (anno_type == RXRPC_TX_ANNO_NAK) continue; + summary->nr_new_nacks++; if (anno_type == RXRPC_TX_ANNO_RETRANS) continue; call->rxtx_annotations[ix] = @@ -558,7 +579,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { - u8 ack_reason; + struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -581,10 +602,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; - ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? - buf.ack.reason : RXRPC_ACK__INVALID); + summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? 
+ buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -592,7 +613,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_ack_names[ack_reason], + rxrpc_ack_names[summary.ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -649,12 +670,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_proto_abort("AKN", call, 0); if (after(hard_ack, call->tx_hard_ack)) - rxrpc_rotate_tx_window(call, hard_ack); + rxrpc_rotate_tx_window(call, hard_ack, &summary); if (nr_acks > 0) { if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) return rxrpc_proto_abort("XSA", call, 0); - rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, + &summary); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { @@ -669,11 +691,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, */ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _proto("Rx ACKALL %%%u", sp->hdr.serial); - rxrpc_rotate_tx_window(call, call->tx_top); + rxrpc_rotate_tx_window(call, call->tx_top, &summary); if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) rxrpc_end_tx_phase(call, false, "ETL"); } -- cgit v1.2.3 From 0d967960d39ee89f9e0289692e9f7232f490e55c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Schedule an ACK if the reply to a client call appears overdue If we've sent all the request data in a client call but haven't seen any sign of the reply data yet, schedule an ACK to be sent to the server to find out if the reply data got lost. If the server hasn't yet hard-ACK'd the request data, we send a PING ACK to demand a response to find out whether we need to retransmit. If the server says it has received all of the data, we send an IDLE ACK to tell the server that we haven't received anything in the receive phase as yet. To make this work, a non-immediate PING ACK must carry a delay. I've chosen the same as the IDLE ACK for the moment. 
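The decision described above can be modelled by the sketch below. The helper and its parameter names are hypothetical, purely for illustration; the real logic is split between rxrpc_end_tx_phase() and rxrpc_input_ack():

    #include <stdbool.h>
    #include <stdio.h>

    enum ack_kind { ACK_NONE, ACK_PING, ACK_IDLE };

    /*
     * Once all request data has been transmitted and no reply DATA has shown
     * up, pick the kind of prodding ACK: a PING if the server still hasn't
     * hard-ACK'd everything (do we need to retransmit?), otherwise an IDLE ACK
     * to say that nothing has been received in the reply phase yet.
     */
    static enum ack_kind overdue_reply_ack(unsigned int tx_top,       /* last seq sent */
                                           unsigned int tx_hard_ack,  /* last seq hard-ACK'd */
                                           bool reply_begun)          /* any reply DATA yet? */
    {
            if (reply_begun)
                    return ACK_NONE;
            if (tx_hard_ack < tx_top)
                    return ACK_PING;
            return ACK_IDLE;
    }

    int main(void)
    {
            printf("%d\n", overdue_reply_ack(8, 5, false)); /* 1: PING, request not fully hard-ACK'd */
            printf("%d\n", overdue_reply_ack(8, 8, false)); /* 2: IDLE, request received, reply missing */
            return 0;
    }

Both kinds are proposed with a delay rather than sent immediately, which is why the PING ACK now shares the idle-ACK expiry in __rxrpc_propose_ACK().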
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 1 + net/rxrpc/input.c | 8 ++++++++ net/rxrpc/misc.c | 2 ++ 4 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1a700b6a998b..b1e697fc9ffb 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -707,7 +707,9 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, rxrpc_propose_ack_respond_to_ping, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index d5bf9ce7ec6f..05b94d1acf52 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -100,6 +100,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, expiry = rxrpc_soft_ack_delay; break; + case RXRPC_ACK_PING: case RXRPC_ACK_IDLE: if (rxrpc_idle_ack_delay < expiry) expiry = rxrpc_idle_ack_delay; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index dd699667eeef..0344f4494eb7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -138,6 +138,8 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true, + rxrpc_propose_ack_client_tx_end); trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); } else { trace_rxrpc_transmit(call, rxrpc_transmit_end); @@ -684,6 +686,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, return; } + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST && + summary.nr_acks == call->tx_top - hard_ack) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + false, true, + rxrpc_propose_ack_ping_for_lost_reply); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 901c012a2700..a608769343e6 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -198,7 +198,9 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { }; const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", -- cgit v1.2.3 From 57494343cb5d66962bb197878fb1cc576177db31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Implement slow-start Implement RxRPC slow-start, which is similar to RFC 5681 for TCP. A tracepoint is added to log the state of the congestion management algorithm and the decisions it makes. Notes: (1) Since we send fixed-size DATA packets (apart from the final packet in each phase), counters and calculations are in terms of packets rather than bytes. (2) The ACK packet carries the equivalent of TCP SACK. (3) The FLIGHT_SIZE calculation in RFC 5681 doesn't seem particularly suited to SACK of a small number of packets. It seems that, almost inevitably, by the time three 'duplicate' ACKs have been seen, we have narrowed the loss down to one or two missing packets, and the FLIGHT_SIZE calculation ends up as 2. 
(4) In rxrpc_resend(), if there was no data that apparently needed retransmission, we transmit a PING ACK to ask the peer to tell us what its Rx window state is. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 45 ++++++++++++ net/rxrpc/ar-internal.h | 53 +++++++++++++- net/rxrpc/call_event.c | 36 ++++++++- net/rxrpc/call_object.c | 13 ++++ net/rxrpc/conn_event.c | 1 + net/rxrpc/input.c | 169 +++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/misc.c | 19 +++++ net/rxrpc/output.c | 9 ++- net/rxrpc/sendmsg.c | 7 +- 9 files changed, 339 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 56475497043d..ada12d00118c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -570,6 +570,51 @@ TRACE_EVENT(rxrpc_retransmit, __entry->expiry) ); +TRACE_EVENT(rxrpc_congest, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + + TP_ARGS(call, summary, ack_serial, change), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(enum rxrpc_congest_change, change ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, top ) + __field(rxrpc_seq_t, lowest_nak ) + __field(rxrpc_serial_t, ack_serial ) + __field_struct(struct rxrpc_ack_summary, sum ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->change = change; + __entry->hard_ack = call->tx_hard_ack; + __entry->top = call->tx_top; + __entry->lowest_nak = call->acks_lowest_nak; + __entry->ack_serial = ack_serial; + memcpy(&__entry->sum, summary, sizeof(__entry->sum)); + ), + + TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + __entry->call, + __entry->ack_serial, + rxrpc_ack_names[__entry->sum.ack_reason], + __entry->hard_ack, + rxrpc_congest_modes[__entry->sum.mode], + __entry->sum.cwnd, + __entry->sum.ssthresh, + __entry->sum.nr_acks, __entry->sum.nr_nacks, + __entry->sum.nr_new_acks, __entry->sum.nr_new_nacks, + __entry->sum.nr_rot_new_acks, + __entry->top - __entry->hard_ack, + __entry->sum.cumulative_acks, + __entry->sum.dup_acks, + __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", + rxrpc_congest_changes[__entry->change], + __entry->sum.retrans_timeo ? " rTxTo" : "") + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b1e697fc9ffb..ca96e547cb9a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -402,6 +402,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_PINGING, /* Ping in process */ + RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; /* @@ -446,6 +447,17 @@ enum rxrpc_call_completion { NR__RXRPC_CALL_COMPLETIONS }; +/* + * Call Tx congestion management modes. + */ +enum rxrpc_congest_mode { + RXRPC_CALL_SLOW_START, + RXRPC_CALL_CONGEST_AVOIDANCE, + RXRPC_CALL_PACKET_LOSS, + RXRPC_CALL_FAST_RETRANSMIT, + NR__RXRPC_CONGEST_MODES +}; + /* * RxRPC call definition * - matched by { connection, call_id } @@ -518,6 +530,20 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + + /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS + * is fixed, we keep these numbers in terms of segments (ie. DATA + * packets) rather than bytes. 
+ */ +#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN + u8 cong_cwnd; /* Congestion window size */ + u8 cong_extra; /* Extra to send for congestion management */ + u8 cong_ssthresh; /* Slow-start threshold */ + enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ + u8 cong_dup_acks; /* Count of ACKs showing missing packets */ + u8 cong_cumul_acks; /* Cumulative ACK count */ + ktime_t cong_tstamp; /* Last time cwnd was changed */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not * consumed packet follows this. */ @@ -539,12 +565,13 @@ struct rxrpc_call { ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ + ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ }; /* - * Summary of a new ACK and the changes it made. + * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u8 ack_reason; @@ -554,6 +581,14 @@ struct rxrpc_ack_summary { u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool new_low_nack; /* T if new low NACK found */ + bool retrans_timeo; /* T if reTx due to timeout happened */ + u8 flight_size; /* Number of unreceived transmissions */ + /* Place to stash values for tracing */ + enum rxrpc_congest_mode mode:8; + u8 cwnd; + u8 ssthresh; + u8 dup_acks; + u8 cumulative_acks; }; enum rxrpc_skb_trace { @@ -709,6 +744,7 @@ extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, @@ -729,6 +765,21 @@ enum rxrpc_propose_ack_outcome { extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; +enum rxrpc_congest_change { + rxrpc_cong_begin_retransmission, + rxrpc_cong_cleared_nacks, + rxrpc_cong_new_low_nack, + rxrpc_cong_no_change, + rxrpc_cong_progress, + rxrpc_cong_retransmit_again, + rxrpc_cong_rtt_window_end, + rxrpc_cong_saw_nack, + rxrpc_congest__nr_change +}; + +extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; +extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; + extern const char *const rxrpc_pkts[]; extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 05b94d1acf52..0e8478012212 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -146,6 +146,14 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&call->lock); } +/* + * Handle congestion being detected by the retransmit timeout. + */ +static void rxrpc_congestion_timeout(struct rxrpc_call *call) +{ + set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); +} + /* * Perform retransmission of NAK'd and unack'd packets. 
*/ @@ -154,9 +162,9 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; int ix; - u8 annotation, anno_type; + u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -193,10 +201,13 @@ static void rxrpc_resend(struct rxrpc_call *call) oldest = skb->tstamp; continue; } + if (!(annotation & RXRPC_TX_ANNO_RESENT)) + unacked++; } /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + retrans++; trace_rxrpc_retransmit(call, seq, annotation | anno_type, ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } @@ -210,6 +221,25 @@ static void rxrpc_resend(struct rxrpc_call *call) * reached the nsec timeout yet. */ + if (unacked) + rxrpc_congestion_timeout(call); + + /* If there was nothing that needed retransmission then it's likely + * that an ACK got lost somewhere. Send a ping to find out instead of + * retransmitting data. + */ + if (!retrans) { + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + spin_unlock_bh(&call->lock); + ack_ts = ktime_sub(now, call->acks_latest_ts); + if (ktime_to_ns(ack_ts) < call->peer->rtt) + goto out; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto out; + } + /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the * lock is dropped, it may clear some of the retransmission markers for @@ -260,6 +290,7 @@ static void rxrpc_resend(struct rxrpc_call *call) out_unlock: spin_unlock_bh(&call->lock); +out: _leave(""); } @@ -293,6 +324,7 @@ recheck_state: if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); + goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a53f4c2c0025..d4b3293b78fa 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -160,6 +160,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->rx_expect_next = 1; + + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; + call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; return call; nomem_2: @@ -176,6 +184,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; + ktime_t now; _enter(""); @@ -185,6 +194,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; call->tx_phase = true; + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; _leave(" = %p", call); return call; @@ -325,6 +337,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_ACCEPTING; if (sp->hdr.securityIndex > 0) call->state = RXRPC_CALL_SERVER_SECURING; + call->cong_tstamp = skb->tstamp; /* Set the channel for this call. 
We don't get channel_lock as we're * only defending against the data_ready handler (which we're called diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1cf1ec5f29e..37609ce89f52 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -97,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.maxMTU = htonl(mtu); pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + pkt.whdr.flags |= RXRPC_SLOW_START_OK; len += sizeof(pkt.ack) + sizeof(pkt.info); break; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0344f4494eb7..094720dd1eaf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,166 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Do TCP-style congestion management [RFC 5681]. + */ +static void rxrpc_congestion_management(struct rxrpc_call *call, + struct sk_buff *skb, + struct rxrpc_ack_summary *summary) +{ + enum rxrpc_congest_change change = rxrpc_cong_no_change; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int cumulative_acks = call->cong_cumul_acks; + unsigned int cwnd = call->cong_cwnd; + bool resend = false; + + summary->flight_size = + (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + + if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { + summary->retrans_timeo = true; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = 1; + if (cwnd > call->cong_ssthresh && + call->cong_mode == RXRPC_CALL_SLOW_START) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + cumulative_acks = 0; + } + } + + cumulative_acks += summary->nr_new_acks; + cumulative_acks += summary->nr_rot_new_acks; + if (cumulative_acks > 255) + cumulative_acks = 255; + + summary->mode = call->cong_mode; + summary->cwnd = call->cong_cwnd; + summary->ssthresh = call->cong_ssthresh; + summary->cumulative_acks = cumulative_acks; + summary->dup_acks = call->cong_dup_acks; + + switch (call->cong_mode) { + case RXRPC_CALL_SLOW_START: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + if (summary->cumulative_acks > 0) + cwnd += 1; + if (cwnd > call->cong_ssthresh) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + } + goto out; + + case RXRPC_CALL_CONGEST_AVOIDANCE: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + + /* We analyse the number of packets that get ACK'd per RTT + * period and increase the window if we managed to fill it. 
+ */ + if (call->peer->rtt_usage == 0) + goto out; + if (ktime_before(skb->tstamp, + ktime_add_ns(call->cong_tstamp, + call->peer->rtt))) + goto out_no_clear_ca; + change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = skb->tstamp; + if (cumulative_acks >= cwnd) + cwnd++; + goto out; + + case RXRPC_CALL_PACKET_LOSS: + if (summary->nr_nacks == 0) + goto resume_normality; + + if (summary->new_low_nack) { + change = rxrpc_cong_new_low_nack; + call->cong_dup_acks = 1; + if (call->cong_extra > 1) + call->cong_extra = 1; + goto send_extra_data; + } + + call->cong_dup_acks++; + if (call->cong_dup_acks < 3) + goto send_extra_data; + + change = rxrpc_cong_begin_retransmission; + call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = call->cong_ssthresh + 3; + call->cong_extra = 0; + call->cong_dup_acks = 0; + resend = true; + goto out; + + case RXRPC_CALL_FAST_RETRANSMIT: + if (!summary->new_low_nack) { + if (summary->nr_new_acks == 0) + cwnd += 1; + call->cong_dup_acks++; + if (call->cong_dup_acks == 2) { + change = rxrpc_cong_retransmit_again; + call->cong_dup_acks = 0; + resend = true; + } + } else { + change = rxrpc_cong_progress; + cwnd = call->cong_ssthresh; + if (summary->nr_nacks == 0) + goto resume_normality; + } + goto out; + + default: + BUG(); + goto out; + } + +resume_normality: + change = rxrpc_cong_cleared_nacks; + call->cong_dup_acks = 0; + call->cong_extra = 0; + call->cong_tstamp = skb->tstamp; + if (cwnd <= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_SLOW_START; + else + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; +out: + cumulative_acks = 0; +out_no_clear_ca: + if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) + cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_cwnd = cwnd; + call->cong_cumul_acks = cumulative_acks; + trace_rxrpc_congest(call, summary, sp->hdr.serial, change); + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + return; + +packet_loss_detected: + change = rxrpc_cong_saw_nack; + call->cong_mode = RXRPC_CALL_PACKET_LOSS; + call->cong_dup_acks = 0; + goto send_extra_data; + +send_extra_data: + /* Send some previously unsent DATA if we have some to advance the ACK + * state. + */ + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST || + summary->nr_acks != call->tx_top - call->tx_hard_ack) { + call->cong_extra++; + wake_up(&call->waitq); + } + goto out_no_clear_ca; +} + /* * Ping the other end to fill our RTT cache and to retrieve the rwind * and MTU parameters. 
@@ -524,7 +684,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - bool resend = false; int ix; u8 annotation, anno_type; @@ -556,16 +715,11 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; - resend = true; break; default: return rxrpc_proto_abort("SFT", call, 0); } } - - if (resend && - !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); } /* @@ -663,6 +817,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial, call->acks_latest); return; } + call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; if (before(hard_ack, call->tx_hard_ack) || @@ -692,6 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); + + return rxrpc_congestion_management(call, skb, &summary); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a608769343e6..aedb8978226d 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -200,6 +200,7 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", @@ -214,3 +215,21 @@ const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { [rxrpc_propose_ack_update] = " Update", [rxrpc_propose_ack_subsume] = " Subsume", }; + +const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { + [RXRPC_CALL_SLOW_START] = "SlowStart", + [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", + [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", + [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", +}; + +const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { + [rxrpc_cong_begin_retransmission] = " Retrans", + [rxrpc_cong_cleared_nacks] = " Cleared", + [rxrpc_cong_new_low_nack] = " NewLowN", + [rxrpc_cong_no_change] = "", + [rxrpc_cong_progress] = " Progres", + [rxrpc_cong_retransmit_again] = " ReTxAgn", + [rxrpc_cong_rtt_window_end] = " RttWinE", + [rxrpc_cong_saw_nack] = " SawNack", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3eb01445e814..cf43a715685e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -157,6 +157,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); + pkt->whdr.flags |= RXRPC_SLOW_START_OK; + iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); @@ -276,8 +278,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; - /* If our RTT cache needs working on, request an ACK. */ - if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + /* If our RTT cache needs working on, request an ACK. Also request + * ACKs if a DATA packet appears to have been lost. 
+ */ + if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) whdr.flags |= RXRPC_REQUEST_ACK; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 99939372b5a4..1f8040d82395 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -45,7 +45,9 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (call->tx_top - call->tx_hard_ack < call->tx_winsize) + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) break; if (call->state >= RXRPC_CALL_COMPLETE) { ret = -call->error; @@ -203,7 +205,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); if (call->tx_top - call->tx_hard_ack >= - call->tx_winsize) { + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.3 From c2675de447f8238e7e2e7eced78fa671d42a9a7e Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Sat, 24 Sep 2016 14:01:04 -0400 Subject: gre: use nla_get_be32() to extract flowinfo Eliminate a sparse endianness mismatch warning, use nla_get_be32() to extract a __be32 value instead of nla_get_u32(). Signed-off-by: Lance Richardson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 397e1ed3daa3..4ce74f86291b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1239,7 +1239,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); if (data[IFLA_GRE_FLOWINFO]) - parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); + parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); -- cgit v1.2.3
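For the ip6_gre flowinfo change above, the mismatch can be reproduced outside the kernel with a minimal, hypothetical test like the one below (type and helper names merely mimic the kernel's). Both accessors return the same bits; the change is purely about keeping the __be32 "bitwise" typing intact under sparse (make C=1):

    #include <stdio.h>
    #include <string.h>

    #ifdef __CHECKER__                      /* set when sparse is the checker */
    #define __bitwise __attribute__((bitwise))
    #define __force   __attribute__((force))
    #else
    #define __bitwise
    #define __force
    #endif

    typedef unsigned int u32;
    typedef u32 __bitwise be32;             /* stand-in for the kernel's __be32 */

    /* fake accessors: same behaviour, different sparse typing */
    static u32  fake_nla_get_u32(const void *p)  { u32 v;  memcpy(&v, p, 4); return v; }
    static be32 fake_nla_get_be32(const void *p) { be32 v; memcpy(&v, p, 4); return v; }

    struct parms { be32 flowinfo; };        /* held in network byte order */

    int main(void)
    {
            unsigned char attr[4] = { 0x12, 0x34, 0x56, 0x78 };
            struct parms parms;

            /* sparse would flag this: incorrect type in assignment (different base types) */
            /* parms.flowinfo = fake_nla_get_u32(attr); */

            parms.flowinfo = fake_nla_get_be32(attr);  /* __be32 end to end: clean */
            printf("flowinfo=%#x\n", (__force u32)parms.flowinfo);
            return 0;
    }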