From aba8269588301f7778bea811d6f7ec74c2e37279 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Wed, 28 Aug 2013 15:09:40 +0800
Subject: {ipv4,xfrm}: Introduce xfrm_tunnel_notifier for xfrm tunnel mode
 callback

Some thoughts on IPv4 VTI implementation:

The connection between VTI receiving part and xfrm tunnel mode input process
is hardly a "xfrm_tunnel", xfrm_tunnel is used in places where, e.g ipip/sit
and xfrm4_tunnel, acts like a true "tunnel" device.

In addition, IMHO, VTI doesn't need vti_err to do something meaningful, as all
VTI needs is just a notifier to be called whenever xfrm_input ingress a packet
to update statistics.

A IPsec protected packet is first handled by protocol handlers, e.g AH/ESP,
to check packet authentication or encryption rightness. PMTU update is taken
care of in this stage by protocol error handler.

Then the packet is rearranged properly depending on whether it's transport
mode or tunnel mode packed by mode "input" handler. The VTI handler code
takes effects in this stage in tunnel mode only. So it neither need propagate
PMTU, as it has already been done if necessary, nor the VTI handler is
qualified as a xfrm_tunnel.

So this patch introduces xfrm_tunnel_notifier and meanwhile wipe out vti_err
code.

Signed-off-by: Fan Du <fan.du@windriver.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: David S. Miller <davem@davemloft.net>
Reviewed-by: Saurabh Mohan <saurabh.mohan@vyatta.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c            | 67 +-------------------------------------------
 net/ipv4/xfrm4_mode_tunnel.c | 16 +++++------
 2 files changed, 9 insertions(+), 74 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e805e7b3030e..91f69bc883fe 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -49,70 +49,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
 static int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
-static int vti_err(struct sk_buff *skb, u32 info)
-{
-
-	/* All the routers (except for Linux) return only
-	 * 8 bytes of packet payload. It means, that precise relaying of
-	 * ICMP in the real Internet is absolutely infeasible.
-	 */
-	struct net *net = dev_net(skb->dev);
-	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
-	struct iphdr *iph = (struct iphdr *)skb->data;
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	struct ip_tunnel *t;
-	int err;
-
-	switch (type) {
-	default:
-	case ICMP_PARAMETERPROB:
-		return 0;
-
-	case ICMP_DEST_UNREACH:
-		switch (code) {
-		case ICMP_SR_FAILED:
-		case ICMP_PORT_UNREACH:
-			/* Impossible event. */
-			return 0;
-		default:
-			/* All others are translated to HOST_UNREACH. */
-			break;
-		}
-		break;
-	case ICMP_TIME_EXCEEDED:
-		if (code != ICMP_EXC_TTL)
-			return 0;
-		break;
-	}
-
-	err = -ENOENT;
-
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-			     iph->daddr, iph->saddr, 0);
-	if (t == NULL)
-		goto out;
-
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-				 t->parms.link, 0, IPPROTO_IPIP, 0);
-		err = 0;
-		goto out;
-	}
-
-	err = 0;
-	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
-		goto out;
-
-	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
-		t->err_count++;
-	else
-		t->err_count = 1;
-	t->err_time = jiffies;
-out:
-	return err;
-}
-
 /* We dont digest the packet therefore let the packet pass */
 static int vti_rcv(struct sk_buff *skb)
 {
@@ -296,9 +232,8 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 }
 
-static struct xfrm_tunnel vti_handler __read_mostly = {
+static struct xfrm_tunnel_notifier vti_handler __read_mostly = {
 	.handler	=	vti_rcv,
-	.err_handler	=	vti_err,
 	.priority	=	1,
 };
 
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index eb1dd4d643f2..b82cde1ea1b6 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -16,13 +16,13 @@
 #include <net/xfrm.h>
 
 /* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
+static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly;
 static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
 
-int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
+int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler)
 {
-	struct xfrm_tunnel __rcu **pprev;
-	struct xfrm_tunnel *t;
+	struct xfrm_tunnel_notifier __rcu **pprev;
+	struct xfrm_tunnel_notifier *t;
 	int ret = -EEXIST;
 	int priority = handler->priority;
 
@@ -50,10 +50,10 @@ err:
 }
 EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
 
-int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
+int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler)
 {
-	struct xfrm_tunnel __rcu **pprev;
-	struct xfrm_tunnel *t;
+	struct xfrm_tunnel_notifier __rcu **pprev;
+	struct xfrm_tunnel_notifier *t;
 	int ret = -ENOENT;
 
 	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
@@ -134,7 +134,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 
 static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_tunnel *handler;
+	struct xfrm_tunnel_notifier *handler;
 	int err = -EINVAL;
 
 	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
-- 
cgit v1.2.3


From 16edfe7ee02dd7fcc13855ea19158333529533b2 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 5 Sep 2013 15:36:09 -0700
Subject: tcp: fix no cwnd growth after timeout

In commit 0f7cc9a3 "tcp: increase throughput when reordering is high",
it only allows cwnd to increase in Open state. This mistakenly disables
slow start after timeout (CA_Loss). Moreover cwnd won't grow if the
state moves from Disorder to Open later in tcp_fastretrans_alert().

Therefore the correct logic should be to allow cwnd to grow as long
as the data is received in order in Open, Loss, or even Disorder state.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1969e16d936d..894bc174f472 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3162,16 +3162,14 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 
 	/* If reordering is high then always grow cwnd whenever data is
 	 * delivered regardless of its ordering. Otherwise stay conservative
-	 * and only grow cwnd on in-order delivery in Open state, and retain
-	 * cwnd in Disordered state (RFC5681). A stretched ACK with
+	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
 	 * new SACK or ECE mark may first advance cwnd here and later reduce
 	 * cwnd in tcp_fastretrans_alert() based on more states.
 	 */
 	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
 		return flag & FLAG_FORWARD_PROGRESS;
 
-	return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
-	       flag & FLAG_DATA_ACKED;
+	return flag & FLAG_DATA_ACKED;
 }
 
 /* Check that window update is acceptable.
-- 
cgit v1.2.3


From 4e4f1fc226816905c937f9b29dabe351075dfe0f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 6 Sep 2013 10:35:58 -0700
Subject: tcp: properly increase rcv_ssthresh for ofo packets

TCP receive window handling is multi staged.

A socket has a memory budget, static or dynamic, in sk_rcvbuf.

Because we do not really know how this memory budget translates to
a TCP window (payload), TCP announces a small initial window
(about 20 MSS).

When a packet is received, we increase TCP rcv_win depending
on the payload/truesize ratio of this packet. Good citizen
packets give a hint that it's reasonable to have rcv_win = sk_rcvbuf/2

This heuristic takes place in tcp_grow_window()

Problem is : We currently call tcp_grow_window() only for in-order
packets.

This means that reorders or packet losses stop proper grow of
rcv_win, and senders are unable to benefit from fast recovery,
or proper reordering level detection.

Really, a packet being stored in OFO queue is not a bad citizen.
It should be part of the game as in-order packets.

In our traces, we very often see sender is limited by linux small
receive windows, even if linux hosts use autotuning (DRS) and should
allow rcv_win to grow to ~3MB.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 894bc174f472..25a89eaa669d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4139,6 +4139,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 		} else {
+			tcp_grow_window(sk, skb);
 			kfree_skb_partial(skb, fragstolen);
 			skb = NULL;
 		}
@@ -4214,8 +4215,10 @@ add_sack:
 	if (tcp_is_sack(tp))
 		tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
-	if (skb)
+	if (skb) {
+		tcp_grow_window(sk, skb);
 		skb_set_owner_r(skb, sk);
+	}
 }
 
 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
-- 
cgit v1.2.3


From 6de5a8bfcae6e3b427d642eff078d8305b324b52 Mon Sep 17 00:00:00 2001
From: Sha Zhengju <handai.szj@taobao.com>
Date: Thu, 12 Sep 2013 15:13:47 -0700
Subject: memcg: rename RESOURCE_MAX to RES_COUNTER_MAX

RESOURCE_MAX is far too general name, change it to RES_COUNTER_MAX.

Signed-off-by: Sha Zhengju <handai.szj@taobao.com>
Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Jeff Liu <jeff.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h |  2 +-
 kernel/res_counter.c        |  8 ++++----
 mm/memcontrol.c             |  4 ++--
 net/ipv4/tcp_memcontrol.c   | 10 +++++-----
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 586bc7cb1dcf..201a69749659 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -54,7 +54,7 @@ struct res_counter {
 	struct res_counter *parent;
 };
 
-#define RESOURCE_MAX ULLONG_MAX
+#define RES_COUNTER_MAX ULLONG_MAX
 
 /**
  * Helpers to interact with userspace
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e7049..3f0417f97e76 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,8 +17,8 @@
 void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = RESOURCE_MAX;
-	counter->soft_limit = RESOURCE_MAX;
+	counter->limit = RES_COUNTER_MAX;
+	counter->soft_limit = RES_COUNTER_MAX;
 	counter->parent = parent;
 }
 
@@ -182,12 +182,12 @@ int res_counter_memparse_write_strategy(const char *buf,
 {
 	char *end;
 
-	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
+	/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
 	if (*buf == '-') {
 		*res = simple_strtoull(buf + 1, &end, 10);
 		if (*res != 1 || *end != '\0')
 			return -EINVAL;
-		*res = RESOURCE_MAX;
+		*res = RES_COUNTER_MAX;
 		return 0;
 	}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b5cfb509270..2c71f243186e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4967,7 +4967,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 	 */
 	mutex_lock(&memcg_create_mutex);
 	mutex_lock(&set_limit_mutex);
-	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
 		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
 			ret = -EBUSY;
 			goto out;
@@ -4977,7 +4977,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 
 		ret = memcg_update_cache_sizes(memcg);
 		if (ret) {
-			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
 			goto out;
 		}
 		static_key_slow_inc(&memcg_kmem_enabled_key);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 8a57d79b0b16..559d4ae6ebf4 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 	if (!cg_proto)
 		return -EINVAL;
 
-	if (val > RESOURCE_MAX)
-		val = RESOURCE_MAX;
+	if (val > RES_COUNTER_MAX)
+		val = RES_COUNTER_MAX;
 
 	tcp = tcp_from_cgproto(cg_proto);
 
@@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX)
+	if (val == RES_COUNTER_MAX)
 		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-	else if (val != RESOURCE_MAX) {
+	else if (val != RES_COUNTER_MAX) {
 		/*
 		 * The active bit needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
@@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
 
 	switch (cft->private) {
 	case RES_LIMIT:
-		val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
+		val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
 		break;
 	case RES_USAGE:
 		val = tcp_read_usage(memcg);
-- 
cgit v1.2.3


From bafd4bd4dcfa13145db7f951251eef3e10f8c278 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 9 Sep 2013 10:38:38 +0200
Subject: xfrm: Decode sessions with output interface.

The output interface matching does not work on forward
policy lookups, the output interface of the flowi is
always 0. Fix this by setting the output interface when
we decode the session.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 1 +
 net/ipv6/xfrm6_policy.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9a459be24af7..ccde54248c8c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -107,6 +107,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 	memset(fl4, 0, sizeof(struct flowi4));
 	fl4->flowi4_mark = skb->mark;
+	fl4->flowi4_oif = skb_dst(skb)->dev->ifindex;
 
 	if (!ip_is_fragment(iph)) {
 		switch (iph->protocol) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 23ed03d786c8..08ed2772b7aa 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -138,6 +138,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 	memset(fl6, 0, sizeof(struct flowi6));
 	fl6->flowi6_mark = skb->mark;
+	fl6->flowi6_oif = skb_dst(skb)->dev->ifindex;
 
 	fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
 	fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
-- 
cgit v1.2.3


From 269aa759b474570fa642452742741525cfc226a9 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 16 Sep 2013 21:44:20 -0400
Subject: tcp: fix RTO calculated from cached RTT

Commit 1b7fdd2ab5852 ("tcp: do not use cached RTT for RTT estimation")
did not correctly account for the fact that crtt is the RTT shifted
left 3 bits. Fix the calculation to consistently reflect this fact.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4a22f3e715df..52f3c6b971d2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -502,7 +502,9 @@ reset:
 	 * ACKs, wait for troubles.
 	 */
 	if (crtt > tp->srtt) {
-		inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
+		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+		crtt >>= 3;
+		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
 	} else if (tp->srtt == 0) {
 		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
 		 * 3WHS. This is most likely due to retransmission,
-- 
cgit v1.2.3


From 749154aa56b57652a282cbde57a57abc278d1205 Mon Sep 17 00:00:00 2001
From: Ansis Atteka <aatteka@nicira.com>
Date: Wed, 18 Sep 2013 15:29:52 -0700
Subject: ip: use ip_hdr() in __ip_make_skb() to retrieve IP header

skb->data already points to IP header, but for the sake of
consistency we can also use ip_hdr() to retrieve it.

Signed-off-by: Ansis Atteka <aatteka@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9ee17e3d11c3..eae2e262fbe5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1316,7 +1316,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	else
 		ttl = ip_select_ttl(inet, &rt->dst);
 
-	iph = (struct iphdr *)skb->data;
+	iph = ip_hdr(skb);
 	iph->version = 4;
 	iph->ihl = 5;
 	iph->tos = inet->tos;
-- 
cgit v1.2.3


From 703133de331a7a7df47f31fb9de51dc6f68a9de8 Mon Sep 17 00:00:00 2001
From: Ansis Atteka <aatteka@nicira.com>
Date: Wed, 18 Sep 2013 15:29:53 -0700
Subject: ip: generate unique IP identificator if local fragmentation is
 allowed

If local fragmentation is allowed, then ip_select_ident() and
ip_select_ident_more() need to generate unique IDs to ensure
correct defragmentation on the peer.

For example, if IPsec (tunnel mode) has to encrypt large skbs
that have local_df bit set, then all IP fragments that belonged
to different ESP datagrams would have used the same identificator.
If one of these IP fragments would get lost or reordered, then
peer could possibly stitch together wrong IP fragments that did
not belong to the same datagram. This would lead to a packet loss
or data corruption.

Signed-off-by: Ansis Atteka <aatteka@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pptp.c          |  2 +-
 include/net/ip.h                | 12 ++++++++----
 net/ipv4/igmp.c                 |  4 ++--
 net/ipv4/inetpeer.c             |  4 ++--
 net/ipv4/ip_output.c            |  6 +++---
 net/ipv4/ipmr.c                 |  2 +-
 net/ipv4/raw.c                  |  2 +-
 net/ipv4/xfrm4_mode_tunnel.c    |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c |  2 +-
 9 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 6fa5ae00039f..01805319e1e0 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	nf_reset(skb);
 
 	skb->ip_summed = CHECKSUM_NONE;
-	ip_select_ident(iph, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 	ip_send_check(iph);
 
 	ip_local_out(skb);
diff --git a/include/net/ip.h b/include/net/ip.h
index 48f55979d842..5e5268807a1c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -264,9 +264,11 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 
 extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
 
-static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, struct sock *sk)
+static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk)
 {
-	if (iph->frag_off & htons(IP_DF)) {
+	struct iphdr *iph = ip_hdr(skb);
+
+	if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) {
 		/* This is only to work around buggy Windows95/2000
 		 * VJ compression implementations.  If the ID field
 		 * does not change, they drop every other packet in
@@ -278,9 +280,11 @@ static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, str
 		__ip_select_ident(iph, dst, 0);
 }
 
-static inline void ip_select_ident_more(struct iphdr *iph, struct dst_entry *dst, struct sock *sk, int more)
+static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more)
 {
-	if (iph->frag_off & htons(IP_DF)) {
+	struct iphdr *iph = ip_hdr(skb);
+
+	if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) {
 		if (sk && inet_sk(sk)->inet_daddr) {
 			iph->id = htons(inet_sk(sk)->inet_id);
 			inet_sk(sk)->inet_id += 1 + more;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d6c0e64ec97f..dace87f06e5f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	pip->saddr    = fl4.saddr;
 	pip->protocol = IPPROTO_IGMP;
 	pip->tot_len  = 0;	/* filled in later */
-	ip_select_ident(pip, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 	((u8 *)&pip[1])[0] = IPOPT_RA;
 	((u8 *)&pip[1])[1] = 4;
 	((u8 *)&pip[1])[2] = 0;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	iph->daddr    = dst;
 	iph->saddr    = fl4.saddr;
 	iph->protocol = IPPROTO_IGMP;
-	ip_select_ident(iph, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 	((u8 *)&iph[1])[0] = IPOPT_RA;
 	((u8 *)&iph[1])[1] = 4;
 	((u8 *)&iph[1])[2] = 0;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 000e3d239d64..33d5537881ed 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -32,8 +32,8 @@
  *  At the moment of writing this notes identifier of IP packets is generated
  *  to be unpredictable using this code only for packets subjected
  *  (actually or potentially) to defragmentation.  I.e. DF packets less than
- *  PMTU in size uses a constant ID and do not use this code (see
- *  ip_select_ident() in include/net/ip.h).
+ *  PMTU in size when local fragmentation is disabled use a constant ID and do
+ *  not use this code (see ip_select_ident() in include/net/ip.h).
  *
  *  Route cache entries hold references to our nodes.
  *  New cache entries get references via lookup by destination IP address in
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eae2e262fbe5..a04d872c54f9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
 	iph->saddr    = saddr;
 	iph->protocol = sk->sk_protocol;
-	ip_select_ident(iph, &rt->dst, sk);
+	ip_select_ident(skb, &rt->dst, sk);
 
 	if (opt && opt->opt.optlen) {
 		iph->ihl += opt->opt.optlen>>2;
@@ -386,7 +386,7 @@ packet_routed:
 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 	}
 
-	ip_select_ident_more(iph, &rt->dst, sk,
+	ip_select_ident_more(skb, &rt->dst, sk,
 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	skb->priority = sk->sk_priority;
@@ -1324,7 +1324,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
 	ip_copy_addrs(iph, fl4);
-	ip_select_ident(iph, &rt->dst, sk);
+	ip_select_ident(skb, &rt->dst, sk);
 
 	if (opt) {
 		iph->ihl += opt->optlen>>2;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9ae54b09254f..62212c772a4b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1658,7 +1658,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	iph->protocol	=	IPPROTO_IPIP;
 	iph->ihl	=	5;
 	iph->tot_len	=	htons(skb->len);
-	ip_select_ident(iph, skb_dst(skb), NULL);
+	ip_select_ident(skb, skb_dst(skb), NULL);
 	ip_send_check(iph);
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a86c7ae71881..bfec521c717f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -387,7 +387,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 		iph->check   = 0;
 		iph->tot_len = htons(length);
 		if (!iph->id)
-			ip_select_ident(iph, &rt->dst, NULL);
+			ip_select_ident(skb, &rt->dst, NULL);
 
 		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 	}
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index eb1dd4d643f2..b5663c37f089 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -117,7 +117,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
 		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
-	ip_select_ident(top_iph, dst->child, NULL);
+	ip_select_ident(skb, dst->child, NULL);
 
 	top_iph->ttl = ip4_dst_hoplimit(dst->child);
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b75ff6429a04..c47444e4cf8c 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -883,7 +883,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	iph->daddr		=	cp->daddr.ip;
 	iph->saddr		=	saddr;
 	iph->ttl		=	old_iph->ttl;
-	ip_select_ident(iph, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
-- 
cgit v1.2.3


From 1a462d189280b560bd84af1407e4d848e262b3b3 Mon Sep 17 00:00:00 2001
From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Fri, 20 Sep 2013 18:20:28 +0800
Subject: net: udp: do not report ICMP redirects to user space

Redirect isn't an error condition, it should leave
the error handler without touching the socket.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 2 +-
 net/ipv6/udp.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 74d2c95db57f..0ca44df51ee9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -658,7 +658,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 		break;
 	case ICMP_REDIRECT:
 		ipv4_sk_redirect(skb, sk);
-		break;
+		goto out;
 	}
 
 	/*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f4058150262b..72b7eaaf3ca0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -525,8 +525,10 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (type == ICMPV6_PKT_TOOBIG)
 		ip6_sk_update_pmtu(skb, sk, info);
-	if (type == NDISC_REDIRECT)
+	if (type == NDISC_REDIRECT) {
 		ip6_sk_redirect(skb, sk);
+		goto out;
+	}
 
 	np = inet6_sk(sk);
 
-- 
cgit v1.2.3


From 8d65b1190ddc548b0411477f308d04f4595bac57 Mon Sep 17 00:00:00 2001
From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Fri, 20 Sep 2013 18:21:25 +0800
Subject: net: raw: do not report ICMP redirects to user space

Redirect isn't an error condition, it should leave
the error handler without touching the socket.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 4 +++-
 net/ipv6/raw.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bfec521c717f..193db03540ad 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -218,8 +218,10 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 		ipv4_sk_update_pmtu(skb, sk, info);
-	else if (type == ICMP_REDIRECT)
+	else if (type == ICMP_REDIRECT) {
 		ipv4_sk_redirect(skb, sk);
+		return;
+	}
 
 	/* Report error on raw socket, if:
 	   1. User requested ip_recverr.
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 58916bbb1728..a4ed2416399e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -335,8 +335,10 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 		ip6_sk_update_pmtu(skb, sk, info);
 		harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
 	}
-	if (type == NDISC_REDIRECT)
+	if (type == NDISC_REDIRECT) {
 		ip6_sk_redirect(skb, sk);
+		return;
+	}
 	if (np->recverr) {
 		u8 *payload = skb->data;
 		if (!inet->hdrincl)
-- 
cgit v1.2.3


From 8c27bd75f04fb9cb70c69c3cfe24f4e6d8e15906 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 20 Sep 2013 22:32:55 +0200
Subject: tcp: syncookies: reduce cookie lifetime to 128 seconds

We currently accept cookies that were created less than 4 minutes ago
(ie, cookies with counter delta 0-3).  Combined with the 8 mss table
values, this yields 32 possible values (out of 2**32) that will be valid.

Reducing the lifetime to < 2 minutes halves the guessing chance while
still providing a large enough period.

While at it, get rid of jiffies value -- they overflow too quickly on
32 bit platforms.

getnstimeofday is used to create a counter that increments every 64s.
perf shows getnstimeofday cost is negible compared to sha_transform;
normal tcp initial sequence number generation uses getnstimeofday, too.

Reported-by: Jakob Lell <jakob@jakoblell.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 18 ++++++++++++++++++
 net/ipv4/syncookies.c | 31 ++++++++++---------------------
 net/ipv6/syncookies.c | 24 +++++++-----------------
 3 files changed, 35 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0e47551e9bdb..de870ee5582d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -481,6 +481,24 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 			     struct ip_options *opt);
 #ifdef CONFIG_SYN_COOKIES
+#include <linux/ktime.h>
+
+/* Syncookies use a monotonic timer which increments every 64 seconds.
+ * This counter is used both as a hash input and partially encoded into
+ * the cookie value.  A cookie is only validated further if the delta
+ * between the current counter value and the encoded one is less than this,
+ * i.e. a sent cookie is valid only at most for 128 seconds (or less if
+ * the counter advances immediately after a cookie is generated).
+ */
+#define MAX_SYNCOOKIE_AGE 2
+
+static inline u32 tcp_cookie_time(void)
+{
+	struct timespec now;
+	getnstimeofday(&now);
+	return now.tv_sec >> 6; /* 64 seconds granularity */
+}
+
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 14a15c49129d..b6ea2979a2b7 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -89,8 +89,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)
 
 
 static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
-				   __be16 dport, __u32 sseq, __u32 count,
-				   __u32 data)
+				   __be16 dport, __u32 sseq, __u32 data)
 {
 	/*
 	 * Compute the secure sequence number.
@@ -102,7 +101,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
 	 * As an extra hack, we add a small "data" value that encodes the
 	 * MSS into the second hash value.
 	 */
-
+	u32 count = tcp_cookie_time();
 	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
 		sseq + (count << COOKIEBITS) +
 		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -114,22 +113,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
  * If the syncookie is bad, the data returned will be out of
  * range.  This must be checked by the caller.
  *
- * The count value used to generate the cookie must be within
- * "maxdiff" if the current (passed-in) "count".  The return value
- * is (__u32)-1 if this test fails.
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value (__u32)-1 if this test fails.
  */
 static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
-				  __be16 sport, __be16 dport, __u32 sseq,
-				  __u32 count, __u32 maxdiff)
+				  __be16 sport, __be16 dport, __u32 sseq)
 {
-	__u32 diff;
+	u32 diff, count = tcp_cookie_time();
 
 	/* Strip away the layers from the cookie */
 	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
 
 	/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
 	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
-	if (diff >= maxdiff)
+	if (diff >= MAX_SYNCOOKIE_AGE)
 		return (__u32)-1;
 
 	return (cookie -
@@ -173,7 +171,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 
 	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
 				     th->source, th->dest, ntohl(th->seq),
-				     jiffies / (HZ * 60), mssind);
+				     mssind);
 }
 EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
 
@@ -188,13 +186,6 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
 	return __cookie_v4_init_sequence(iph, th, mssp);
 }
 
-/*
- * This (misnamed) value is the age of syncookie which is permitted.
- * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
- * sysctl_tcp_retries1. It's a rather complicated formula (exponential
- * backoff) to compute at runtime so it's currently hardcoded here.
- */
-#define COUNTER_TRIES 4
 /*
  * Check if a ack sequence number is a valid syncookie.
  * Return the decoded mss if it is, or 0 if not.
@@ -204,9 +195,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 {
 	__u32 seq = ntohl(th->seq) - 1;
 	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
-					    th->source, th->dest, seq,
-					    jiffies / (HZ * 60),
-					    COUNTER_TRIES);
+					    th->source, th->dest, seq);
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bf63ac8a49b9..13ca0a0ea680 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -36,14 +36,6 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-/*
- * This (misnamed) value is the age of syncookie which is permitted.
- * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
- * sysctl_tcp_retries1. It's a rather complicated formula (exponential
- * backoff) to compute at runtime so it's currently hardcoded here.
- */
-#define COUNTER_TRIES 4
-
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
 					   struct dst_entry *dst)
@@ -86,8 +78,9 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd
 static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
 				   const struct in6_addr *daddr,
 				   __be16 sport, __be16 dport, __u32 sseq,
-				   __u32 count, __u32 data)
+				   __u32 data)
 {
+	u32 count = tcp_cookie_time();
 	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
 		sseq + (count << COOKIEBITS) +
 		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -96,15 +89,14 @@ static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
 
 static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr,
 				  const struct in6_addr *daddr, __be16 sport,
-				  __be16 dport, __u32 sseq, __u32 count,
-				  __u32 maxdiff)
+				  __be16 dport, __u32 sseq)
 {
-	__u32 diff;
+	__u32 diff, count = tcp_cookie_time();
 
 	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
 
 	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
-	if (diff >= maxdiff)
+	if (diff >= MAX_SYNCOOKIE_AGE)
 		return (__u32)-1;
 
 	return (cookie -
@@ -125,8 +117,7 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
 	*mssp = msstab[mssind];
 
 	return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
-				     th->dest, ntohl(th->seq),
-				     jiffies / (HZ * 60), mssind);
+				     th->dest, ntohl(th->seq), mssind);
 }
 EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
 
@@ -146,8 +137,7 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
 {
 	__u32 seq = ntohl(th->seq) - 1;
 	__u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
-					    th->source, th->dest, seq,
-					    jiffies / (HZ * 60), COUNTER_TRIES);
+					    th->source, th->dest, seq);
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
-- 
cgit v1.2.3


From 086293542b991fb88a2e41ae7b4f82ac65a20e1a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 20 Sep 2013 22:32:56 +0200
Subject: tcp: syncookies: reduce mss table to four values

Halve mss table size to make blind cookie guessing more difficult.
This is sad since the tables were already small, but there
is little alternative except perhaps adding more precise mss information
in the tcp timestamp.  Timestamps are unfortunately not ubiquitous.

Guessing all possible cookie values still has 8-in 2**32 chance.

Reported-by: Jakob Lell <jakob@jakoblell.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 22 +++++++++++-----------
 net/ipv6/syncookies.c | 15 +++++++++------
 2 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b6ea2979a2b7..15e024105f91 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -136,22 +136,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
 }
 
 /*
- * MSS Values are taken from the 2009 paper
- * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
- *  - values 1440 to 1460 accounted for 80% of observed mss values
- *  - values outside the 536-1460 range are rare (<0.2%).
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ *  .. lower than 536 are rare (< 0.2%)
+ *  .. between 537 and 1299 account for less than < 1.5% of observed values
+ *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ *  .. exceeding 1460 are very rare (< 0.04%)
  *
- * Table must be sorted.
+ *  1460 is the single most frequently announced mss value (30 to 46% depending
+ *  on monitor location).  Table must be sorted.
  */
 static __u16 const msstab[] = {
-	64,
-	512,
 	536,
-	1024,
-	1440,
+	1300,
+	1440,	/* 1440, 1452: PPPoE */
 	1460,
-	4312,
-	8960,
 };
 
 /*
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 13ca0a0ea680..d703218a653b 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -24,15 +24,18 @@
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-/* Table must be sorted. */
+/* RFC 2460, Section 8.3:
+ * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
+ *
+ * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows
+ * using higher values than ipv4 tcp syncookies.
+ * The other values are chosen based on ethernet (1500 and 9k MTU), plus
+ * one that accounts for common encap (PPPoe) overhead. Table must be sorted.
+ */
 static __u16 const msstab[] = {
-	64,
-	512,
-	536,
-	1280 - 60,
+	1280 - 60, /* IPV6_MIN_MTU - 60 */
 	1480 - 60,
 	1500 - 60,
-	4460 - 60,
 	9000 - 60,
 };
 
-- 
cgit v1.2.3


From b0983d3c9b132c33b6fb2e28d157a1edc18a173c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 20 Sep 2013 13:56:58 -0700
Subject: tcp: fix dynamic right sizing

Dynamic Right Sizing (DRS) is supposed to open TCP receive window
automatically, but suffers from two bugs, presented by order
of importance.

1) tcp_rcv_space_adjust() fix :

Using twice the last received amount is very pessimistic,
because it doesn't allow fast recovery or proper slow start
ramp up, if sender wants to increase cwin by 100% every RTT.

copied = bytes received in previous RTT

2*copied = bytes we expect to receive in next RTT

4*copied = bytes we need to advertise in rwin at end of next RTT

DRS is one RTT late, it needs a 4x factor.

If sender is not using ABC, and increases cwin by 50% every rtt,
then we needed 1.5*1.5 = 2.25 factor.
This is probably why this bug was not really noticed.

2) There is no window adjustment after first RTT. DRS triggers only
  after the second RTT.
  DRS needs two RTT to initialize, so tcp_fixup_rcvbuf() should setup
  sk_rcvbuf to allow proper window grow for first two RTT.

This patch increases TCP efficiency particularly for large RTT flows
when autotuning is used at the receiver, and more particularly
in presence of packet losses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 84 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 31 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669d..5d083855c111 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -355,6 +355,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 		 tcp_default_init_rwnd(mss);
 
+	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+	 * Allow enough cushion so that sender is not limited by our window
+	 */
+	if (sysctl_tcp_moderate_rcvbuf)
+		rcvmem <<= 2;
+
 	if (sk->sk_rcvbuf < rcvmem)
 		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
@@ -373,6 +379,8 @@ void tcp_init_buffer_space(struct sock *sk)
 		tcp_fixup_sndbuf(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
+	tp->rcvq_space.time = tcp_time_stamp;
+	tp->rcvq_space.seq = tp->copied_seq;
 
 	maxwin = tcp_full_space(sk);
 
@@ -512,48 +520,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time;
-	int space;
-
-	if (tp->rcvq_space.time == 0)
-		goto new_measure;
+	int copied;
 
 	time = tcp_time_stamp - tp->rcvq_space.time;
 	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 		return;
 
-	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)
+		goto new_measure;
+
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
+
+	if (sysctl_tcp_moderate_rcvbuf &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		int rcvwin, rcvmem, rcvbuf;
 
-	space = max(tp->rcvq_space.space, space);
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = (copied << 1) + 16 * tp->advmss;
 
-	if (tp->rcvq_space.space != space) {
-		int rcvmem;
+		/* If rate increased by 25%,
+		 *	assume slow start, rcvwin = 3 * copied
+		 * If rate increased by 50%,
+		 *	assume sender can use 2x growth, rcvwin = 4 * copied
+		 */
+		if (copied >=
+		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+			if (copied >=
+			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+				rcvwin <<= 1;
+			else
+				rcvwin += (rcvwin >> 1);
+		}
 
-		tp->rcvq_space.space = space;
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+		while (tcp_win_from_space(rcvmem) < tp->advmss)
+			rcvmem += 128;
 
-		if (sysctl_tcp_moderate_rcvbuf &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-			int new_clamp = space;
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		if (rcvbuf > sk->sk_rcvbuf) {
+			sk->sk_rcvbuf = rcvbuf;
 
-			/* Receive space grows, normalize in order to
-			 * take into account packet headers and sk_buff
-			 * structure overhead.
-			 */
-			space /= tp->advmss;
-			if (!space)
-				space = 1;
-			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-			while (tcp_win_from_space(rcvmem) < tp->advmss)
-				rcvmem += 128;
-			space *= rcvmem;
-			space = min(space, sysctl_tcp_rmem[2]);
-			if (space > sk->sk_rcvbuf) {
-				sk->sk_rcvbuf = space;
-
-				/* Make the window clamp follow along.  */
-				tp->window_clamp = new_clamp;
-			}
+			/* Make the window clamp follow along.  */
+			tp->window_clamp = rcvwin;
 		}
 	}
+	tp->rcvq_space.space = copied;
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
@@ -5674,8 +5696,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			tcp_init_congestion_control(sk);
 
 			tcp_mtup_init(sk);
-			tcp_init_buffer_space(sk);
 			tp->copied_seq = tp->rcv_nxt;
+			tcp_init_buffer_space(sk);
 		}
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
-- 
cgit v1.2.3


From 9a3bab6b05383f1e4c3716b3615500c51285959e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 24 Sep 2013 06:19:57 -0700
Subject: net: net_secret should not depend on TCP

A host might need net_secret[] and never open a single socket.

Problem added in commit aebda156a570782
("net: defer net_secret[] initialization")

Based on prior patch from Hannes Frederic Sowa.

Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@strressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/secure_seq.h |  1 -
 net/core/secure_seq.c    | 27 ++++++++++++++++++++++++---
 net/ipv4/af_inet.c       |  4 +---
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 6ca975bebd37..c2e542b27a5a 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -3,7 +3,6 @@
 
 #include <linux/types.h>
 
-extern void net_secret_init(void);
 extern __u32 secure_ip_id(__be32 daddr);
 extern __u32 secure_ipv6_id(const __be32 daddr[4]);
 extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 6a2f13cee86a..3f1ec1586ae1 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -10,11 +10,24 @@
 
 #include <net/secure_seq.h>
 
-static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
+#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
 
-void net_secret_init(void)
+static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+
+static void net_secret_init(void)
 {
-	get_random_bytes(net_secret, sizeof(net_secret));
+	u32 tmp;
+	int i;
+
+	if (likely(net_secret[0]))
+		return;
+
+	for (i = NET_SECRET_SIZE; i > 0;) {
+		do {
+			get_random_bytes(&tmp, sizeof(tmp));
+		} while (!tmp);
+		cmpxchg(&net_secret[--i], 0, tmp);
+	}
 }
 
 #ifdef CONFIG_INET
@@ -42,6 +55,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 	u32 hash[MD5_DIGEST_WORDS];
 	u32 i;
 
+	net_secret_init();
 	memcpy(hash, saddr, 16);
 	for (i = 0; i < 4; i++)
 		secret[i] = net_secret[i] + (__force u32)daddr[i];
@@ -63,6 +77,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 	u32 hash[MD5_DIGEST_WORDS];
 	u32 i;
 
+	net_secret_init();
 	memcpy(hash, saddr, 16);
 	for (i = 0; i < 4; i++)
 		secret[i] = net_secret[i] + (__force u32) daddr[i];
@@ -82,6 +97,7 @@ __u32 secure_ip_id(__be32 daddr)
 {
 	u32 hash[MD5_DIGEST_WORDS];
 
+	net_secret_init();
 	hash[0] = (__force __u32) daddr;
 	hash[1] = net_secret[13];
 	hash[2] = net_secret[14];
@@ -96,6 +112,7 @@ __u32 secure_ipv6_id(const __be32 daddr[4])
 {
 	__u32 hash[4];
 
+	net_secret_init();
 	memcpy(hash, daddr, 16);
 	md5_transform(hash, net_secret);
 
@@ -107,6 +124,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 {
 	u32 hash[MD5_DIGEST_WORDS];
 
+	net_secret_init();
 	hash[0] = (__force u32)saddr;
 	hash[1] = (__force u32)daddr;
 	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
@@ -121,6 +139,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
 	u32 hash[MD5_DIGEST_WORDS];
 
+	net_secret_init();
 	hash[0] = (__force u32)saddr;
 	hash[1] = (__force u32)daddr;
 	hash[2] = (__force u32)dport ^ net_secret[14];
@@ -140,6 +159,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 	u32 hash[MD5_DIGEST_WORDS];
 	u64 seq;
 
+	net_secret_init();
 	hash[0] = (__force u32)saddr;
 	hash[1] = (__force u32)daddr;
 	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
@@ -164,6 +184,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 	u64 seq;
 	u32 i;
 
+	net_secret_init();
 	memcpy(hash, saddr, 16);
 	for (i = 0; i < 4; i++)
 		secret[i] = net_secret[i] + daddr[i];
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7a1874b7b8fd..cfeb85cff4f0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -263,10 +263,8 @@ void build_ehash_secret(void)
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
 
-	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) {
+	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
 		get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
-		net_secret_init();
-	}
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
-- 
cgit v1.2.3


From f02db315b8d888570cb0d4496cfbb7e4acb047cb Mon Sep 17 00:00:00 2001
From: Francesco Fusco <ffusco@redhat.com>
Date: Tue, 24 Sep 2013 15:43:08 +0200
Subject: ipv4: IP_TOS and IP_TTL can be specified as ancillary data

This patch enables the IP_TTL and IP_TOS values passed from userspace to
be stored in the ipcm_cookie struct. Three fields are added to the struct:

- the TTL, expressed as __u8.
  The allowed values are in the [1-255].
  A value of 0 means that the TTL is not specified.

- the TOS, expressed as __s16.
  The allowed values are in the range [0,255].
  A value of -1 means that the TOS is not specified.

- the priority, expressed as a char and computed when
  handling the ancillary data.

Signed-off-by: Francesco Fusco <ffusco@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       |  3 +++
 net/ipv4/ip_sockglue.c | 20 +++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index c1f192b8cd0e..0135f3823e66 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -56,6 +56,9 @@ struct ipcm_cookie {
 	int			oif;
 	struct ip_options_rcu	*opt;
 	__u8			tx_flags;
+	__u8			ttl;
+	__s16			tos;
+	char			priority;
 };
 
 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d9c4f113d709..56e34457ac07 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -189,7 +189,7 @@ EXPORT_SYMBOL(ip_cmsg_recv);
 
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 {
-	int err;
+	int err, val;
 	struct cmsghdr *cmsg;
 
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
@@ -215,6 +215,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 			ipc->addr = info->ipi_spec_dst.s_addr;
 			break;
 		}
+		case IP_TTL:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+				return -EINVAL;
+			val = *(int *)CMSG_DATA(cmsg);
+			if (val < 1 || val > 255)
+				return -EINVAL;
+			ipc->ttl = val;
+			break;
+		case IP_TOS:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+				return -EINVAL;
+			val = *(int *)CMSG_DATA(cmsg);
+			if (val < 0 || val > 255)
+				return -EINVAL;
+			ipc->tos = val;
+			ipc->priority = rt_tos2priority(ipc->tos);
+			break;
+
 		default:
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From aa6615814533c634190019ee3a5b10490026d545 Mon Sep 17 00:00:00 2001
From: Francesco Fusco <ffusco@redhat.com>
Date: Tue, 24 Sep 2013 15:43:09 +0200
Subject: ipv4: processing ancillary IP_TOS or IP_TTL

If IP_TOS or IP_TTL are specified as ancillary data, then sendmsg() sends out
packets with the specified TTL or TOS overriding the socket values specified
with the traditional setsockopt().

The struct inet_cork stores the values of TOS, TTL and priority that are
passed through the struct ipcm_cookie. If there are user-specified TOS
(tos != -1) or TTL (ttl != 0) in the struct ipcm_cookie, these values are
used to override the per-socket values. In case of TOS also the priority
is changed accordingly.

Two helper functions get_rttos and get_rtconn_flags are defined to take
into account the presence of a user specified TOS value when computing
RT_TOS and RT_CONN_FLAGS.

Signed-off-by: Francesco Fusco <ffusco@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h |  3 +++
 include/net/ip.h        | 11 +++++++++++
 include/net/route.h     |  1 +
 net/ipv4/icmp.c         |  5 +++++
 net/ipv4/ip_output.c    | 13 ++++++++++---
 net/ipv4/ping.c         |  4 +++-
 net/ipv4/raw.c          |  4 +++-
 net/ipv4/udp.c          |  4 +++-
 8 files changed, 39 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 636d203727a2..f3141773c14d 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -103,6 +103,9 @@ struct inet_cork {
 	int			length; /* Total length of all frames */
 	struct dst_entry	*dst;
 	u8			tx_flags;
+	__u8			ttl;
+	__s16			tos;
+	char			priority;
 };
 
 struct inet_cork_full {
diff --git a/include/net/ip.h b/include/net/ip.h
index 0135f3823e66..77b4f9b57c28 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -28,6 +28,7 @@
 #include <linux/skbuff.h>
 
 #include <net/inet_sock.h>
+#include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
 
@@ -140,6 +141,16 @@ static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
 	return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
 }
 
+static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
+{
+	return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos);
+}
+
+static inline __u8 get_rtconn_flags(struct ipcm_cookie* ipc, struct sock* sk)
+{
+	return (ipc->tos != -1) ? RT_CONN_FLAGS_TOS(sk, ipc->tos) : RT_CONN_FLAGS(sk);
+}
+
 /* datagram.c */
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 
diff --git a/include/net/route.h b/include/net/route.h
index 6f572ca66d25..0ad8e0102386 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -39,6 +39,7 @@
 #define RTO_ONLINK	0x01
 
 #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
+#define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 struct fib_nh;
 struct fib_info;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5f7d11a45871..5c0e8bc6e5ba 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -353,6 +353,9 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
+
 	if (icmp_param->replyopts.opt.opt.optlen) {
 		ipc.opt = &icmp_param->replyopts.opt;
 		if (ipc.opt->opt.srr)
@@ -608,6 +611,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param->replyopts.opt;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
 			       type, code, icmp_param);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a04d872c54f9..7d8357bb2ba6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1060,6 +1060,9 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
 	cork->dst = &rt->dst;
 	cork->length = 0;
+	cork->ttl = ipc->ttl;
+	cork->tos = ipc->tos;
+	cork->priority = ipc->priority;
 	cork->tx_flags = ipc->tx_flags;
 
 	return 0;
@@ -1311,7 +1314,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	if (cork->flags & IPCORK_OPT)
 		opt = cork->opt;
 
-	if (rt->rt_type == RTN_MULTICAST)
+	if (cork->ttl != 0)
+		ttl = cork->ttl;
+	else if (rt->rt_type == RTN_MULTICAST)
 		ttl = inet->mc_ttl;
 	else
 		ttl = ip_select_ttl(inet, &rt->dst);
@@ -1319,7 +1324,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph = ip_hdr(skb);
 	iph->version = 4;
 	iph->ihl = 5;
-	iph->tos = inet->tos;
+	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
 	iph->frag_off = df;
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
@@ -1331,7 +1336,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 		ip_options_build(skb, opt, cork->addr, rt, 0);
 	}
 
-	skb->priority = sk->sk_priority;
+	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
 	skb->mark = sk->sk_mark;
 	/*
 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
@@ -1481,6 +1486,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 	ipc.addr = daddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 
 	if (replyopts.opt.opt.optlen) {
 		ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index d7d9882d4cae..706d108e128c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -713,6 +713,8 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.opt = NULL;
 	ipc.oif = sk->sk_bound_dev_if;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 
 	sock_tx_timestamp(sk, &ipc.tx_flags);
 
@@ -744,7 +746,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			return -EINVAL;
 		faddr = ipc.opt->opt.faddr;
 	}
-	tos = RT_TOS(inet->tos);
+	tos = get_rttos(&ipc, inet);
 	if (sock_flag(sk, SOCK_LOCALROUTE) ||
 	    (msg->msg_flags & MSG_DONTROUTE) ||
 	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bfec521c717f..a3fe534c968e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -517,6 +517,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 	ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
@@ -556,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			daddr = ipc.opt->opt.faddr;
 		}
 	}
-	tos = RT_CONN_FLAGS(sk);
+	tos = get_rtconn_flags(&ipc, sk);
 	if (msg->msg_flags & MSG_DONTROUTE)
 		tos |= RTO_ONLINK;
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 74d2c95db57f..22462d947750 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -855,6 +855,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 
 	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
 
@@ -938,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		faddr = ipc.opt->opt.faddr;
 		connected = 0;
 	}
-	tos = RT_TOS(inet->tos);
+	tos = get_rttos(&ipc, inet);
 	if (sock_flag(sk, SOCK_LOCALROUTE) ||
 	    (msg->msg_flags & MSG_DONTROUTE) ||
 	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
-- 
cgit v1.2.3


From 62748f32d501f5d3712a7c372bbb92abc7c62bc7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 24 Sep 2013 08:20:52 -0700
Subject: net: introduce SO_MAX_PACING_RATE

As mentioned in commit afe4fd062416b ("pkt_sched: fq: Fair Queue packet
scheduler"), this patch adds a new socket option.

SO_MAX_PACING_RATE offers the application the ability to cap the
rate computed by transport layer. Value is in bytes per second.

u32 val = 1000000;
setsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE, &val, sizeof(val));

To be effectively paced, a flow must use FQ packet scheduler.

Note that a packet scheduler takes into account the headers for its
computations. The effective payload rate depends on MSS and retransmits
if any.

I chose to make this pacing rate a SOL_SOCKET option instead of a
TCP one because this can be used by other protocols.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steinar H. Gunderson <sesse@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   |  4 +++-
 arch/avr32/include/uapi/asm/socket.h   |  2 ++
 arch/cris/include/uapi/asm/socket.h    |  2 ++
 arch/frv/include/uapi/asm/socket.h     |  2 ++
 arch/h8300/include/uapi/asm/socket.h   |  2 ++
 arch/ia64/include/uapi/asm/socket.h    |  2 ++
 arch/m32r/include/uapi/asm/socket.h    |  2 ++
 arch/mips/include/uapi/asm/socket.h    |  2 ++
 arch/mn10300/include/uapi/asm/socket.h |  2 ++
 arch/parisc/include/uapi/asm/socket.h  |  2 ++
 arch/powerpc/include/uapi/asm/socket.h |  2 ++
 arch/s390/include/uapi/asm/socket.h    |  2 ++
 arch/sparc/include/uapi/asm/socket.h   |  2 ++
 arch/xtensa/include/uapi/asm/socket.h  |  2 ++
 include/net/sock.h                     |  1 +
 include/uapi/asm-generic/socket.h      |  2 ++
 net/core/sock.c                        | 12 ++++++++++++
 net/ipv4/tcp_input.c                   |  2 +-
 18 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 467de010ea7e..e3a1491d5073 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -81,6 +81,8 @@
 
 #define SO_SELECT_ERR_QUEUE	45
 
-#define SO_BUSY_POLL			46
+#define SO_BUSY_POLL		46
+
+#define SO_MAX_PACING_RATE	47
 
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 11c4259c62fb..439936421434 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* __ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
index eb723e51554e..13829aaaeec5 100644
--- a/arch/cris/include/uapi/asm/socket.h
+++ b/arch/cris/include/uapi/asm/socket.h
@@ -78,6 +78,8 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index f0cb1c341163..5d4299762426 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -76,5 +76,7 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/h8300/include/uapi/asm/socket.h b/arch/h8300/include/uapi/asm/socket.h
index 9490758c5e2b..214ccaf3554a 100644
--- a/arch/h8300/include/uapi/asm/socket.h
+++ b/arch/h8300/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 556d0701a155..c25302fb48d9 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 24be7c8da86a..52966650114f 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 61c01f054d1b..0df9787cd84d 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index e2a2b203eb00..71dedcae55a6 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 71700e636a8e..7c614d01f1fa 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -75,6 +75,8 @@
 
 #define SO_BUSY_POLL		0x4027
 
+#define SO_MAX_PACING_RATE	0x4048
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
  */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index a6d74467c9ed..fa698324a1fd 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -83,4 +83,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 92494494692e..c286c2e868f0 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -82,4 +82,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 4e1d66c3ce71..0f21e9a5ca18 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -72,6 +72,8 @@
 
 #define SO_BUSY_POLL		0x0030
 
+#define SO_MAX_PACING_RATE	0x0031
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index c114483010c1..7db5c22faa68 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4625d2eff461..240aa3f08cd6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -363,6 +363,7 @@ struct sock {
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
 	u32			sk_pacing_rate; /* bytes per second */
+	u32			sk_max_pacing_rate;
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
 	int			sk_gso_type;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f04b69b6abf2..38f14d0264c3 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -78,4 +78,6 @@
 
 #define SO_BUSY_POLL		46
 
+#define SO_MAX_PACING_RATE	47
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 5b6beba494a3..2bd9b3faa0d0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -914,6 +914,13 @@ set_rcvbuf:
 		}
 		break;
 #endif
+
+	case SO_MAX_PACING_RATE:
+		sk->sk_max_pacing_rate = val;
+		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+					 sk->sk_max_pacing_rate);
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1177,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 #endif
 
+	case SO_MAX_PACING_RATE:
+		v.val = sk->sk_max_pacing_rate;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2319,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_ll_usec		=	sysctl_net_busy_read;
 #endif
 
+	sk->sk_max_pacing_rate = ~0U;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d083855c111..66aa816ad30b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -735,7 +735,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	if (tp->srtt > 8 + 2)
 		do_div(rate, tp->srtt);
 
-	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+	sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
-- 
cgit v1.2.3


From f4a87e7bd2eaef26a3ca25437ce8b807de2966ad Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 30 Sep 2013 08:51:46 +0100
Subject: netfilter: synproxy: fix BUG_ON triggered by corrupt TCP packets

TCP packets hitting the SYN proxy through the SYNPROXY target are not
validated by TCP conntrack. When th->doff is below 5, an underflow happens
when calculating the options length, causing skb_header_pointer() to
return NULL and triggering the BUG_ON().

Handle this case gracefully by checking for NULL instead of using BUG_ON().

Reported-by: Martin Topholm <mph@one.com>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_synproxy.h |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c             | 10 +++++++---
 net/ipv6/netfilter/ip6t_SYNPROXY.c            | 10 +++++++---
 net/netfilter/nf_synproxy_core.c              | 12 +++++++-----
 4 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index 806f54a290d6..f572f313d6f1 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -56,7 +56,7 @@ struct synproxy_options {
 
 struct tcphdr;
 struct xt_synproxy_info;
-extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+extern bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 				   const struct tcphdr *th,
 				   struct synproxy_options *opts);
 extern unsigned int synproxy_options_size(const struct synproxy_options *opts);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 67e17dcda65e..b6346bf2fde3 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -267,7 +267,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	if (th == NULL)
 		return NF_DROP;
 
-	synproxy_parse_options(skb, par->thoff, th, &opts);
+	if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+		return NF_DROP;
 
 	if (th->syn && !(th->ack || th->fin || th->rst)) {
 		/* Initial SYN from client */
@@ -350,7 +351,8 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
 
 		/* fall through */
 	case TCP_CONNTRACK_SYN_SENT:
-		synproxy_parse_options(skb, thoff, th, &opts);
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
 
 		if (!th->syn && th->ack &&
 		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
@@ -373,7 +375,9 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
 		if (!th->syn || !th->ack)
 			break;
 
-		synproxy_parse_options(skb, thoff, th, &opts);
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
 		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
 			synproxy->tsoff = opts.tsval - synproxy->its;
 
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 19cfea8dbcaa..2748b042da72 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -282,7 +282,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	if (th == NULL)
 		return NF_DROP;
 
-	synproxy_parse_options(skb, par->thoff, th, &opts);
+	if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+		return NF_DROP;
 
 	if (th->syn && !(th->ack || th->fin || th->rst)) {
 		/* Initial SYN from client */
@@ -372,7 +373,8 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
 
 		/* fall through */
 	case TCP_CONNTRACK_SYN_SENT:
-		synproxy_parse_options(skb, thoff, th, &opts);
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
 
 		if (!th->syn && th->ack &&
 		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
@@ -395,7 +397,9 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
 		if (!th->syn || !th->ack)
 			break;
 
-		synproxy_parse_options(skb, thoff, th, &opts);
+		if (!synproxy_parse_options(skb, thoff, th, &opts))
+			return NF_DROP;
+
 		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
 			synproxy->tsoff = opts.tsval - synproxy->its;
 
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 6fd967c6278c..cdf4567ba9b3 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -24,7 +24,7 @@
 int synproxy_net_id;
 EXPORT_SYMBOL_GPL(synproxy_net_id);
 
-void
+bool
 synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 		       const struct tcphdr *th, struct synproxy_options *opts)
 {
@@ -32,7 +32,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 	u8 buf[40], *ptr;
 
 	ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
-	BUG_ON(ptr == NULL);
+	if (ptr == NULL)
+		return false;
 
 	opts->options = 0;
 	while (length > 0) {
@@ -41,16 +42,16 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 
 		switch (opcode) {
 		case TCPOPT_EOL:
-			return;
+			return true;
 		case TCPOPT_NOP:
 			length--;
 			continue;
 		default:
 			opsize = *ptr++;
 			if (opsize < 2)
-				return;
+				return true;
 			if (opsize > length)
-				return;
+				return true;
 
 			switch (opcode) {
 			case TCPOPT_MSS:
@@ -84,6 +85,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
 			length -= opsize;
 		}
 	}
+	return true;
 }
 EXPORT_SYMBOL_GPL(synproxy_parse_options);
 
-- 
cgit v1.2.3


From d4a71b155c12d0d429c6b69d94076d6d57e2a7a7 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 25 Sep 2013 09:57:47 -0700
Subject: ip_tunnel: Do not use stale inner_iph pointer.

While sending packet skb_cow_head() can change skb header which
invalidates inner_iph pointer to skb header. Following patch
avoid using it. Found by code inspection.

This bug was introduced by commit 0e6fbc5b6c6218 (ip_tunnels: extend
iptunnel_xmit()).

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ac9fabe0300f..d3fbad422e0e 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -623,6 +623,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			tunnel->err_count = 0;
 	}
 
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 	ttl = tnl_params->ttl;
 	if (ttl == 0) {
 		if (skb->protocol == htons(ETH_P_IP))
@@ -651,8 +652,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
-			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
-			    !net_eq(tunnel->net, dev_net(dev)));
+			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
 	return;
-- 
cgit v1.2.3


From c9eeec26e32e087359160406f96e0949b3cc6f10 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Sep 2013 03:28:54 -0700
Subject: tcp: TSQ can use a dynamic limit

When TCP Small Queues was added, we used a sysctl to limit amount of
packets queues on Qdisc/device queues for a given TCP flow.

Problem is this limit is either too big for low rates, or too small
for high rates.

Now TCP stack has rate estimation in sk->sk_pacing_rate, and TSO
auto sizing, it can better control number of packets in Qdisc/device
queues.

New limit is two packets or at least 1 to 2 ms worth of packets.

Low rates flows benefit from this patch by having even smaller
number of packets in queues, allowing for faster recovery,
better RTT estimations.

High rates flows benefit from this patch by allowing more than 2 packets
in flight as we had reports this was a limiting factor to reach line
rate. [ In particular if TX completion is delayed because of coalescing
parameters ]

Example for a single flow on 10Gbp link controlled by FQ/pacing

14 packets in flight instead of 2

$ tc -s -d qd
qdisc fq 8001: dev eth0 root refcnt 32 limit 10000p flow_limit 100p
buckets 1024 quantum 3028 initial_quantum 15140
 Sent 1168459366606 bytes 771822841 pkt (dropped 0, overlimits 0
requeues 6822476)
 rate 9346Mbit 771713pps backlog 953820b 14p requeues 6822476
  2047 flow, 2046 inactive, 1 throttled, delay 15673 ns
  2372 gc, 0 highprio, 0 retrans, 9739249 throttled, 0 flows_plimit

Note that sk_pacing_rate is currently set to twice the actual rate, but
this might be refined in the future when a flow is in congestion
avoidance.

Additional change : skb->destructor should be set to tcp_wfree().

A future patch (for linux 3.13+) might remove tcp_limit_output_bytes

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7c83cb8bf137..e6bb8256e59f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -895,8 +895,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
-			  tcp_wfree : sock_wfree;
+	skb->destructor = tcp_wfree;
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
@@ -1840,7 +1839,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
-
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1869,13 +1867,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
-		/* TSQ : sk_wmem_alloc accounts skb truesize,
-		 * including skb overhead. But thats OK.
+		/* TCP Small Queues :
+		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+		 * This allows for :
+		 *  - better RTT estimation and ACK scheduling
+		 *  - faster recovery
+		 *  - high rates
 		 */
-		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+		limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
+
+		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
 			break;
 		}
+
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
-- 
cgit v1.2.3


From 0bbf87d852d243680ed7074110ccc1dea003b61a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 28 Sep 2013 14:10:59 -0700
Subject: net ipv4: Convert ipv4.ip_local_port_range to be per netns v3

- Move sysctl_local_ports from a global variable into struct netns_ipv4.
- Modify inet_get_local_port_range to take a struct net, and update all
  of the callers.
- Move the initialization of sysctl_local_ports into
   sysctl_net_ipv4.c:ipv4_sysctl_init_net from inet_connection_sock.c

v2:
- Ensure indentation used tabs
- Fixed ip.h so it applies cleanly to todays net-next

v3:
- Compile fixes of strange callers of inet_get_local_port_range.
  This patch now successfully passes an allmodconfig build.
  Removed manual inlining of inet_get_local_port_range in ipv4_local_port_range

Originally-by: Samya <samya@twitter.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/core/cma.c   |  2 +-
 drivers/net/vxlan.c             |  2 +-
 include/net/ip.h                |  6 +----
 include/net/netns/ipv4.h        |  6 +++++
 net/ipv4/inet_connection_sock.c | 20 +++++-----------
 net/ipv4/inet_hashtables.c      |  2 +-
 net/ipv4/ping.c                 |  4 ++--
 net/ipv4/sysctl_net_ipv4.c      | 52 +++++++++++++++++++++++++++--------------
 net/ipv4/udp.c                  |  2 +-
 net/openvswitch/vport-vxlan.c   |  2 +-
 net/sctp/socket.c               |  2 +-
 security/selinux/hooks.c        |  2 +-
 12 files changed, 56 insertions(+), 46 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index dab4b41f1715..a082fd9e7ebe 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2294,7 +2294,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
 	int low, high, remaining;
 	unsigned int rover;
 
-	inet_get_local_port_range(&low, &high);
+	inet_get_local_port_range(&init_net, &low, &high);
 	remaining = (high - low) + 1;
 	rover = net_random() % remaining + low;
 retry:
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d1292fe746bc..c376be7b528a 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2089,7 +2089,7 @@ static void vxlan_setup(struct net_device *dev)
 	vxlan->age_timer.function = vxlan_cleanup;
 	vxlan->age_timer.data = (unsigned long) vxlan;
 
-	inet_get_local_port_range(&low, &high);
+	inet_get_local_port_range(dev_net(dev), &low, &high);
 	vxlan->port_min = low;
 	vxlan->port_max = high;
 	vxlan->dst_port = htons(vxlan_port);
diff --git a/include/net/ip.h b/include/net/ip.h
index 77b4f9b57c28..16078f422397 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -217,11 +217,7 @@ static inline void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
 	}
 }
 
-extern struct local_ports {
-	seqlock_t	lock;
-	int		range[2];
-} sysctl_local_ports;
-void inet_get_local_port_range(int *low, int *high);
+void inet_get_local_port_range(struct net *net, int *low, int *high);
 
 extern unsigned long *sysctl_local_reserved_ports;
 static inline int inet_is_reserved_local_port(int port)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index bf2ec2202c56..5dbd232e12ff 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -15,6 +15,10 @@ struct fib_rules_ops;
 struct hlist_head;
 struct fib_table;
 struct sock;
+struct local_ports {
+	seqlock_t	lock;
+	int		range[2];
+};
 
 struct netns_ipv4 {
 #ifdef CONFIG_SYSCTL
@@ -62,6 +66,8 @@ struct netns_ipv4 {
 	int sysctl_icmp_ratemask;
 	int sysctl_icmp_errors_use_inbound_ifaddr;
 
+	struct local_ports sysctl_local_ports;
+
 	int sysctl_tcp_ecn;
 
 	kgid_t sysctl_ping_group_range[2];
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6acb541c9091..7ac7aa11130e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,27 +29,19 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
-/*
- * This struct holds the first and last local port number.
- */
-struct local_ports sysctl_local_ports __read_mostly = {
-	.lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
-	.range = { 32768, 61000 },
-};
-
 unsigned long *sysctl_local_reserved_ports;
 EXPORT_SYMBOL(sysctl_local_reserved_ports);
 
-void inet_get_local_port_range(int *low, int *high)
+void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
 	unsigned int seq;
 
 	do {
-		seq = read_seqbegin(&sysctl_local_ports.lock);
+		seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
 
-		*low = sysctl_local_ports.range[0];
-		*high = sysctl_local_ports.range[1];
-	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+		*low = net->ipv4.sysctl_local_ports.range[0];
+		*high = net->ipv4.sysctl_local_ports.range[1];
+	} while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
@@ -116,7 +108,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 		int remaining, rover, low, high;
 
 again:
-		inet_get_local_port_range(&low, &high);
+		inet_get_local_port_range(net, &low, &high);
 		remaining = (high - low) + 1;
 		smallest_rover = rover = net_random() % remaining + low;
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7bd8983dbfcf..2779037bd113 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -494,7 +494,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		u32 offset = hint + port_offset;
 		struct inet_timewait_sock *tw = NULL;
 
-		inet_get_local_port_range(&low, &high);
+		inet_get_local_port_range(net, &low, &high);
 		remaining = (high - low) + 1;
 
 		local_bh_disable();
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 706d108e128c..a62610443152 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -237,11 +237,11 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
 	unsigned int seq;
 
 	do {
-		seq = read_seqbegin(&sysctl_local_ports.lock);
+		seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
 
 		*low = data[0];
 		*high = data[1];
-	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+	} while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
 }
 
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 540279f4c531..c08f096d46b5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
 /* Update system visible IP port range */
-static void set_local_port_range(int range[2])
+static void set_local_port_range(struct net *net, int range[2])
 {
-	write_seqlock(&sysctl_local_ports.lock);
-	sysctl_local_ports.range[0] = range[0];
-	sysctl_local_ports.range[1] = range[1];
-	write_sequnlock(&sysctl_local_ports.lock);
+	write_seqlock(&net->ipv4.sysctl_local_ports.lock);
+	net->ipv4.sysctl_local_ports.range[0] = range[0];
+	net->ipv4.sysctl_local_ports.range[1] = range[1];
+	write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
 }
 
 /* Validate changes from /proc interface. */
@@ -56,6 +56,8 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
+	struct net *net =
+		container_of(table->data, struct net, ipv4.sysctl_local_ports.range);
 	int ret;
 	int range[2];
 	struct ctl_table tmp = {
@@ -66,14 +68,15 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 		.extra2 = &ip_local_port_range_max,
 	};
 
-	inet_get_local_port_range(range, range + 1);
+	inet_get_local_port_range(net, &range[0], &range[1]);
+
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
 	if (write && ret == 0) {
 		if (range[1] < range[0])
 			ret = -EINVAL;
 		else
-			set_local_port_range(range);
+			set_local_port_range(net, range);
 	}
 
 	return ret;
@@ -83,23 +86,27 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
 static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
 	kgid_t *data = table->data;
+	struct net *net =
+		container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
 	unsigned int seq;
 	do {
-		seq = read_seqbegin(&sysctl_local_ports.lock);
+		seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
 
 		*low = data[0];
 		*high = data[1];
-	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+	} while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
 }
 
 /* Update system visible IP port range */
 static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
 {
 	kgid_t *data = table->data;
-	write_seqlock(&sysctl_local_ports.lock);
+	struct net *net =
+		container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
+	write_seqlock(&net->ipv4.sysctl_local_ports.lock);
 	data[0] = low;
 	data[1] = high;
-	write_sequnlock(&sysctl_local_ports.lock);
+	write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
 }
 
 /* Validate changes from /proc interface. */
@@ -474,13 +481,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_local_port_range",
-		.data		= &sysctl_local_ports.range,
-		.maxlen		= sizeof(sysctl_local_ports.range),
-		.mode		= 0644,
-		.proc_handler	= ipv4_local_port_range,
-	},
 	{
 		.procname	= "ip_local_reserved_ports",
 		.data		= NULL, /* initialized in sysctl_ipv4_init */
@@ -853,6 +853,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_local_port_range",
+		.maxlen		= sizeof(init_net.ipv4.sysctl_local_ports.range),
+		.data		= &init_net.ipv4.sysctl_local_ports.range,
+		.mode		= 0644,
+		.proc_handler	= ipv4_local_port_range,
+	},
 	{
 		.procname	= "tcp_mem",
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
@@ -888,6 +895,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_ping_group_range;
 		table[7].data =
 			&net->ipv4.sysctl_tcp_ecn;
+		table[8].data =
+			&net->ipv4.sysctl_local_ports.range;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -901,6 +910,13 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
 	net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
 
+	/*
+	 * Set defaults for local port range
+	 */
+	seqlock_init(&net->ipv4.sysctl_local_ports.lock);
+	net->ipv4.sysctl_local_ports.range[0] =  32768;
+	net->ipv4.sysctl_local_ports.range[1] =  61000;
+
 	tcp_init_mem(net);
 
 	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 22462d947750..728ce9503a27 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -219,7 +219,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		unsigned short first, last;
 		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
 
-		inet_get_local_port_range(&low, &high);
+		inet_get_local_port_range(net, &low, &high);
 		remaining = (high - low) + 1;
 
 		rand = net_random();
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index a481c03e2861..56e22b74cf96 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -173,7 +173,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	skb->local_df = 1;
 
-	inet_get_local_port_range(&port_min, &port_max);
+	inet_get_local_port_range(net, &port_min, &port_max);
 	src_port = vxlan_src_port(port_min, port_max, skb);
 
 	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 911b71b26b0e..72046b9729a8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5890,7 +5890,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 		int low, high, remaining, index;
 		unsigned int rover;
 
-		inet_get_local_port_range(&low, &high);
+		inet_get_local_port_range(sock_net(sk), &low, &high);
 		remaining = (high - low) + 1;
 		rover = net_random() % remaining + low;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a5091ec06aa6..568c7699abf1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3929,7 +3929,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 		if (snum) {
 			int low, high;
 
-			inet_get_local_port_range(&low, &high);
+			inet_get_local_port_range(sock_net(sk), &low, &high);
 
 			if (snum < max(PROT_SOCK, low) || snum > high) {
 				err = sel_netport_sid(sk->sk_protocol,
-- 
cgit v1.2.3


From e2401654dd0f5f3fb7a8d80dad9554d73d7ca394 Mon Sep 17 00:00:00 2001
From: Salam Noureddine <noureddine@aristanetworks.com>
Date: Sun, 29 Sep 2013 13:39:42 -0700
Subject: ipv4 igmp: use in_dev_put in timer handlers instead of __in_dev_put

It is possible for the timer handlers to run after the call to
ip_mc_down so use in_dev_put instead of __in_dev_put in the handler
function in order to do proper cleanup when the refcnt reaches 0.
Otherwise, the refcnt can reach zero without the in_device being
destroyed and we end up leaking a reference to the net_device and
see messages like the following,

unregister_netdevice: waiting for eth0 to become free. Usage count = 1

Tested on linux-3.4.43.

Signed-off-by: Salam Noureddine <noureddine@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index dace87f06e5f..7defdc9ba167 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -736,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)
 
 	in_dev->mr_gq_running = 0;
 	igmpv3_send_report(in_dev, NULL);
-	__in_dev_put(in_dev);
+	in_dev_put(in_dev);
 }
 
 static void igmp_ifc_timer_expire(unsigned long data)
@@ -749,7 +749,7 @@ static void igmp_ifc_timer_expire(unsigned long data)
 		igmp_ifc_start_timer(in_dev,
 				     unsolicited_report_interval(in_dev));
 	}
-	__in_dev_put(in_dev);
+	in_dev_put(in_dev);
 }
 
 static void igmp_ifc_event(struct in_device *in_dev)
-- 
cgit v1.2.3


From 3e08f4a72f689c6296d336c2aab4bddd60c93ae2 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 1 Oct 2013 11:33:59 +0200
Subject: ip_tunnel: Fix a memory corruption in ip_tunnel_xmit

We might extend the used aera of a skb beyond the total
headroom when we install the ipip header. Fix this by
calling skb_cow_head() unconditionally.

Bug was introduced with commit c544193214
("GRE: Refactor GRE tunneling code.")

Cc: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d3fbad422e0e..dfc6d8a3caa7 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -642,13 +642,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
 			+ rt->dst.header_len;
-	if (max_headroom > dev->needed_headroom) {
+	if (max_headroom > dev->needed_headroom)
 		dev->needed_headroom = max_headroom;
-		if (skb_cow_head(skb, dev->needed_headroom)) {
-			dev->stats.tx_dropped++;
-			dev_kfree_skb(skb);
-			return;
-		}
+
+	if (skb_cow_head(skb, dev->needed_headroom)) {
+		dev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+		return;
 	}
 
 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
-- 
cgit v1.2.3


From 67013282627185aeec2fb92c75868dcace0d25b4 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 1 Oct 2013 11:34:48 +0200
Subject: ip_tunnel: Add fallback tunnels to the hash lists

Currently we can not update the tunnel parameters of
the fallback tunnels because we don't find them in the
hash lists. Fix this by adding them on initialization.

Bug was introduced with commit c544193214
("GRE: Refactor GRE tunneling code.")

Cc: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index dfc6d8a3caa7..895a3535321b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -853,8 +853,10 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 	/* FB netdevice is special: we have one, and only one per netns.
 	 * Allowing to move it to another netns is clearly unsafe.
 	 */
-	if (!IS_ERR(itn->fb_tunnel_dev))
+	if (!IS_ERR(itn->fb_tunnel_dev)) {
 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+	}
 	rtnl_unlock();
 
 	return PTR_RET(itn->fb_tunnel_dev);
-- 
cgit v1.2.3


From 78a3694d44a029242dd0830b34ab20ef1704be35 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 1 Oct 2013 11:35:51 +0200
Subject: ip_tunnel_core: Change __skb_push back to skb_push

Git commit 0e6fbc5b ("ip_tunnels: extend iptunnel_xmit()")
moved the IP header installation to iptunnel_xmit() and
changed skb_push() to __skb_push(). This makes possible
bugs hard to track down, so change it back to skb_push().

Cc: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d6c856b17fd4..c31e3ad98ef2 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -61,7 +61,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 
 	/* Push down and install the IP header. */
-	__skb_push(skb, sizeof(struct iphdr));
+	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
 
 	iph = ip_hdr(skb);
-- 
cgit v1.2.3


From cfe4a536927c3186b7e7f9be688e7e0f62bb8ea1 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 1 Oct 2013 11:37:37 +0200
Subject: ip_tunnel: Remove double unregister of the fallback device

When queueing the netdevices for removal, we queue the
fallback device twice in ip_tunnel_destroy(). The first
time when we queue all netdevices in the namespace and
then again explicitly. Fix this by removing the explicit
queueing of the fallback device.

Bug was introduced when network namespace support was added
with commit 6c742e714d8 ("ipip: add x-netns support").

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 895a3535321b..63a6d6d6b875 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -886,8 +886,6 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
 			if (!net_eq(dev_net(t->dev), net))
 				unregister_netdevice_queue(t->dev, head);
 	}
-	if (itn->fb_tunnel_dev)
-		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
 }
 
 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
-- 
cgit v1.2.3


From 278f2b3e2af5f32ea1afe34fa12a2518153e6e49 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Mon, 30 Sep 2013 22:05:08 +0200
Subject: netfilter: ipt_ULOG: fix info leaks

The ulog messages leak heap bytes by the means of padding bytes and
incompletely filled string arrays. Fix those by memset(0)'ing the
whole struct before filling it.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_ULOG.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index cbc22158af49..9cb993cd224b 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net,
 	ub->qlen++;
 
 	pm = nlmsg_data(nlh);
+	memset(pm, 0, sizeof(*pm));
 
 	/* We might not have a timestamp, get one */
 	if (skb->tstamp.tv64 == 0)
@@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net,
 	}
 	else if (loginfo->prefix[0] != '\0')
 		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
-	else
-		*(pm->prefix) = '\0';
 
 	if (in && in->hard_header_len > 0 &&
 	    skb->mac_header != skb->network_header &&
@@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net,
 
 	if (in)
 		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
-	else
-		pm->indev_name[0] = '\0';
 
 	if (out)
 		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
-	else
-		pm->outdev_name[0] = '\0';
 
 	/* copy_len <= skb->len, so can't fail. */
 	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
-- 
cgit v1.2.3


From 5843ef421311c34f06d5ab82bced5aebfe185b2c Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 30 Sep 2013 13:29:11 -0700
Subject: tcp: Always set options to 0 before calling tcp_established_options

tcp_established_options assumes opts->options is 0 before calling,
as it read modify writes it.

For the tcp_current_mss() case the opts structure is not zeroed,
so this can be done with uninitialized values.

This is ok, because ->options is not read in this path.
But it's still better to avoid the operation on the uninitialized
field. This shuts up a static code analyzer, and presumably
may help the optimizer.

Cc: netdev@vger.kernel.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e6bb8256e59f..c6f01f2cdb32 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -637,6 +637,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	unsigned int size = 0;
 	unsigned int eff_sacks;
 
+	opts->options = 0;
+
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
 	if (unlikely(*md5)) {
-- 
cgit v1.2.3


From bbe34cf8a1a2cc174e6516fc230b91b531da7ddf Mon Sep 17 00:00:00 2001
From: "baker.zhang" <baker.kernel@gmail.com>
Date: Tue, 1 Oct 2013 07:45:09 +0800
Subject: fib_trie: avoid a redundant bit judgement in inflate

Because 'node' is the i'st child of 'oldnode',
thus, here 'i' equals
tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits)

we just get 1 more bit,
and need not care the detail value of this bits.

I apologize for the mistake.

I generated the patch on a branch version,
and did not notice the put_child has been changed.

I have redone the test on HEAD version with my patch.

two cases are used.
case 1. inflate a node which has a leaf child node.
case 2: inflate a node which has a an child node with skipped bits

test env:
  ip link set eth0 up
  ip a add dev eth0 192.168.11.1/32
here, we just focus on route table(MAIN),
so I use a "192.168.11.1/32" address to simplify the test case.

call trace:
+ fib_insert_node
+ + trie_rebalance
+ + + resize
+ + + + inflate

Test case 1:  inflate a node which has a leaf child node.

===========================================================
step 1. prepare a fib trie
------------------------------------------
  ip r a 192.168.0.0/24 via 192.168.11.1
  ip r a 192.168.1.0/24 via 192.168.11.1

we get a fib trie.
root@baker:~# cat /proc/net/fib_trie
Main:
  +-- 192.168.0.0/23 1 0 0
   |-- 192.168.0.0
    /24 universe UNICAST
   |-- 192.168.1.0
    /24 universe UNICAST
Local:
.....

step 2. Add the third route
------------------------------------------
root@baker:~# ip r a 192.168.2.0/24 via 192.168.11.1

A fib_trie leaf will be inserted in fib_insert_node before trie_rebalance.

For function 'inflate':
'inflate' is called with following trie.
  +-- 192.168.0.0/22 1 1 0 <=== tn node
    +-- 192.168.0.0/23 1 0 0    <== node a
        |-- 192.168.0.0
          /24 universe UNICAST
        |-- 192.168.1.0
          /24 universe UNICAST
      |-- 192.168.2.0          <== leaf(node b)

When process node b, which is a leaf. here:
i is 1,
node key "192.168.2.0"
oldnode is (pos:22, bits:1)

unpatch source:
tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1)
it equals:
tkey_extract_bits("192.168,2,0", 22 + 1, 1)

thus got 0, and call put_child(tn, 2*i, node); <== 2*i=2.

patched source:
tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
tkey_extract_bits("192.168,2,0", 22, 1 + 1)  <== get 2.

Test case 2:  inflate a node which has a an child node with skipped bits
==========================================================================
step 1. prepare a fib trie.
  ip link set eth0 up
  ip a add dev eth0 192.168.11.1/32
  ip r a 192.168.128.0/24 via 192.168.11.1
  ip r a 192.168.0.0/24  via 192.168.11.1
  ip r a 192.168.16.0/24   via 192.168.11.1
  ip r a 192.168.32.0/24  via 192.168.11.1
  ip r a 192.168.48.0/24  via 192.168.11.1
  ip r a 192.168.144.0/24   via 192.168.11.1
  ip r a 192.168.160.0/24   via 192.168.11.1
  ip r a 192.168.176.0/24   via 192.168.11.1

check:
root@baker:~# cat /proc/net/fib_trie
Main:
  +-- 192.168.0.0/16 1 0 0
     +-- 192.168.0.0/18 2 0 0
        |-- 192.168.0.0
           /24 universe UNICAST
        |-- 192.168.16.0
           /24 universe UNICAST
        |-- 192.168.32.0
           /24 universe UNICAST
        |-- 192.168.48.0
           /24 universe UNICAST
     +-- 192.168.128.0/18 2 0 0
        |-- 192.168.128.0
           /24 universe UNICAST
        |-- 192.168.144.0
           /24 universe UNICAST
        |-- 192.168.160.0
           /24 universe UNICAST
        |-- 192.168.176.0
           /24 universe UNICAST
Local:
  ...

step 2. add a route to trigger inflate.
  ip r a 192.168.96.0/24   via 192.168.11.1

This command will call serveral times inflate.
In the first time, the fib_trie is:
________________________
+-- 192.168.128.0/(16, 1) <== tn node
 +-- 192.168.0.0/(17, 1)  <== node a
  +-- 192.168.0.0/(18, 2)
   |-- 192.168.0.0
   |-- 192.168.16.0
   |-- 192.168.32.0
   |-- 192.168.48.0
  |-- 192.168.96.0
 +-- 192.168.128.0/(18, 2) <== node b.
  |-- 192.168.128.0
  |-- 192.168.144.0
  |-- 192.168.160.0
  |-- 192.168.176.0

NOTE: node b is a interal node with skipped bits.
here,
i:1,
node->key "192.168.128.0",
oldnode:(pos:16, bits:1)
so
tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1)
it equals:
tkey_extract_bits("192.168,128,0", 16 + 1, 1) <=== 0

tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits, 1)
it equals:
tkey_extract_bits("192.168,128,0", 16, 1+1) <=== 2

2*i + 0 == 2, so the result is same.

Signed-off-by: baker.zhang <baker.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3df6d3edb2a1..45c74ba03970 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -762,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 		   tn->pos + tn->bits - 1) {
-			if (tkey_extract_bits(node->key,
-					      oldtnode->pos + oldtnode->bits,
-					      1) == 0)
-				put_child(tn, 2*i, node);
-			else
-				put_child(tn, 2*i+1, node);
+			put_child(tn,
+				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
+				node);
 			continue;
 		}
 
-- 
cgit v1.2.3


From 6ae705323b716ea7a8cc26bee79176398a9b2e89 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Oct 2013 10:23:44 -0700
Subject: tcp: sndbuf autotuning improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tcp_fixup_sndbuf() is underestimating initial send buffer requirements.

It was not noticed because big GSO packets were escaping the limitation,
but with smaller TSO packets (or TSO/GSO/SG off), application hits
sk_sndbuf before having a chance to fill enough packets in socket write
queue.

- initial cwnd can be bigger than 10 for specific routes

- SKB_TRUESIZE() is a bit under real needs in some cases,
  because of power-of-two rounding in kmalloc()

- Fast Recovery (RFC 5681 3.2) : Cubic needs 70% factor

- Extra cushion (application might react slowly to POLLOUT)

tcp_v4_conn_req_fastopen() needs to call tcp_init_metrics() before
calling tcp_init_buffer_space()

Then we realize tcp_new_space() should call tcp_fixup_sndbuf()
instead of duplicating this stuff.

Rename tcp_fixup_sndbuf() to tcp_sndbuf_expand() to be more
descriptive.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 38 +++++++++++++++++++++++++-------------
 net/ipv4/tcp_ipv4.c  |  2 +-
 2 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 66aa816ad30b..cd65674ece92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
 
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to POLLOUT)
+	 */
+	sndmem = 2 * nr_segs * per_mss;
 
-	sndmem *= TCP_INIT_CWND;
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
@@ -376,7 +396,7 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		tcp_fixup_rcvbuf(sk);
 	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-		tcp_fixup_sndbuf(sk);
+		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
 	tp->rcvq_space.time = tcp_time_stamp;
@@ -4723,15 +4743,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = SKB_TRUESIZE(max_t(u32,
-						tp->rx_opt.mss_clamp,
-						tp->mss_cache) +
-					  MAX_TCP_HEADER);
-		int demanded = max_t(unsigned int, tp->snd_cwnd,
-				     tp->reordering + 1);
-		sndmem *= 2 * demanded;
-		if (sndmem > sk->sk_sndbuf)
-			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tcp_sndbuf_expand(sk);
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b14266bb91eb..5d6b1a609da8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1410,8 +1410,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 	inet_csk(child)->icsk_af_ops->rebuild_header(child);
 	tcp_init_congestion_control(child);
 	tcp_mtup_init(child);
-	tcp_init_buffer_space(child);
 	tcp_init_metrics(child);
+	tcp_init_buffer_space(child);
 
 	/* Queue the data carried in the SYN packet. We need to first
 	 * bump skb's refcnt because the caller will attempt to free it.
-- 
cgit v1.2.3


From 80ad1d61e72d626e30ebe8529a0455e660ca4693 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Oct 2013 21:04:11 -0700
Subject: net: do not call sock_put() on TIMEWAIT sockets

commit 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU /
hlist_nulls") incorrectly used sock_put() on TIMEWAIT sockets.

We should instead use inet_twsk_put()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c  | 2 +-
 net/ipv6/inet6_hashtables.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7bd8983dbfcf..96da9c77deca 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -287,7 +287,7 @@ begintw:
 			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
 						    saddr, daddr, ports,
 						    dif))) {
-				sock_put(sk);
+				inet_twsk_put(inet_twsk(sk));
 				goto begintw;
 			}
 			goto out;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 32b4a1675d82..066640e0ba8e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -116,7 +116,7 @@ begintw:
 			}
 			if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
 						     ports, dif))) {
-				sock_put(sk);
+				inet_twsk_put(inet_twsk(sk));
 				goto begintw;
 			}
 			goto out;
-- 
cgit v1.2.3


From 5080546682bae3d32734b18e281091684f0ebbe4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Oct 2013 04:29:50 -0700
Subject: inet: consolidate INET_TW_MATCH

TCP listener refactoring, part 2 :

We can use a generic lookup, sockets being in whatever state, if
we are sure all relevant fields are at the same place in all socket
types (ESTABLISH, TIME_WAIT, SYN_RECV)

This patch removes these macros :

 inet_addrpair, inet_addrpair, tw_addrpair, tw_portpair

And adds :

 sk_portpair, sk_addrpair, sk_daddr, sk_rcv_saddr

Then, INET_TW_MATCH() is really the same than INET_MATCH()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             |  4 ++--
 include/net/inet_hashtables.h    | 26 ++++++++------------------
 include/net/inet_sock.h          |  2 --
 include/net/inet_timewait_sock.h |  8 --------
 include/net/sock.h               |  4 ++++
 net/ipv4/inet_connection_sock.c  | 11 +++++------
 net/ipv6/udp.c                   |  6 ++----
 7 files changed, 21 insertions(+), 40 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 28ea38439313..b7f1f3bb346d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -370,7 +370,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
-	((inet_sk(__sk)->inet_portpair == (__ports))		&&	\
+	(((__sk)->sk_portpair == (__ports))			&&	\
 	 ((__sk)->sk_family == AF_INET6)			&&	\
 	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&&	\
 	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&&	\
@@ -379,7 +379,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 	 net_eq(sock_net(__sk), (__net)))
 
 #define INET6_TW_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	   \
-	((inet_twsk(__sk)->tw_portpair == (__ports))			&& \
+	(((__sk)->sk_portpair == (__ports))				&& \
 	 ((__sk)->sk_family == AF_INET6)				&& \
 	 ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_daddr, (__saddr))	&& \
 	 ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_rcv_saddr, (__daddr)) && \
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 594dfeead70f..10d6838378c3 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -302,35 +302,25 @@ static inline struct sock *inet_lookup_listener(struct net *net,
 				   ((__force __u64)(__be32)(__saddr)));
 #endif /* __BIG_ENDIAN */
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)	\
-	((inet_sk(__sk)->inet_portpair == (__ports))		&&	\
-	 (inet_sk(__sk)->inet_addrpair == (__cookie))		&&	\
+	(((__sk)->sk_portpair == (__ports))			&&	\
+	 ((__sk)->sk_addrpair == (__cookie))			&&	\
 	 (!(__sk)->sk_bound_dev_if	||				\
 	   ((__sk)->sk_bound_dev_if == (__dif))) 		&& 	\
 	 net_eq(sock_net(__sk), (__net)))
-#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
-	((inet_twsk(__sk)->tw_portpair == (__ports))	&&		\
-	 (inet_twsk(__sk)->tw_addrpair == (__cookie))	&&		\
-	 (!(__sk)->sk_bound_dev_if	||				\
-	   ((__sk)->sk_bound_dev_if == (__dif)))	&&		\
-	 net_eq(sock_net(__sk), (__net)))
 #else /* 32-bit arch */
 #define INET_ADDR_COOKIE(__name, __saddr, __daddr)
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \
-	((inet_sk(__sk)->inet_portpair == (__ports))	&&		\
-	 (inet_sk(__sk)->inet_daddr	== (__saddr))	&&		\
-	 (inet_sk(__sk)->inet_rcv_saddr	== (__daddr))	&&		\
-	 (!(__sk)->sk_bound_dev_if	||				\
-	   ((__sk)->sk_bound_dev_if == (__dif))) 	&&		\
-	 net_eq(sock_net(__sk), (__net)))
-#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \
-	((inet_twsk(__sk)->tw_portpair == (__ports))	&&		\
-	 (inet_twsk(__sk)->tw_daddr	== (__saddr))	&&		\
-	 (inet_twsk(__sk)->tw_rcv_saddr	== (__daddr))	&&		\
+	(((__sk)->sk_portpair == (__ports))		&&		\
+	 ((__sk)->sk_daddr	== (__saddr))		&&		\
+	 ((__sk)->sk_rcv_saddr	== (__daddr))		&&		\
 	 (!(__sk)->sk_bound_dev_if	||				\
 	   ((__sk)->sk_bound_dev_if == (__dif))) 	&&		\
 	 net_eq(sock_net(__sk), (__net)))
 #endif /* 64-bit arch */
 
+#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
+	INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)
+
 /*
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f3141773c14d..6d9a7e6eb5a4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -146,10 +146,8 @@ struct inet_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 #define inet_daddr		sk.__sk_common.skc_daddr
 #define inet_rcv_saddr		sk.__sk_common.skc_rcv_saddr
-#define inet_addrpair		sk.__sk_common.skc_addrpair
 #define inet_dport		sk.__sk_common.skc_dport
 #define inet_num		sk.__sk_common.skc_num
-#define inet_portpair		sk.__sk_common.skc_portpair
 
 	__be32			inet_saddr;
 	__s16			uc_ttl;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 828200ab1125..48fd3561722c 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -112,10 +112,8 @@ struct inet_timewait_sock {
 #define tw_net			__tw_common.skc_net
 #define tw_daddr        	__tw_common.skc_daddr
 #define tw_rcv_saddr    	__tw_common.skc_rcv_saddr
-#define tw_addrpair		__tw_common.skc_addrpair
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
-#define tw_portpair		__tw_common.skc_portpair
 
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
@@ -189,12 +187,6 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 	return (struct inet_timewait_sock *)sk;
 }
 
-static inline __be32 sk_rcv_saddr(const struct sock *sk)
-{
-/* both inet_sk() and inet_twsk() store rcv_saddr in skc_rcv_saddr */
-	return sk->__sk_common.skc_rcv_saddr;
-}
-
 void inet_twsk_put(struct inet_timewait_sock *tw);
 
 int inet_twsk_unhash(struct inet_timewait_sock *tw);
diff --git a/include/net/sock.h b/include/net/sock.h
index f0a44cc1ff9d..e3bf213be625 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -300,6 +300,10 @@ struct sock {
 #define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
 #define sk_hash			__sk_common.skc_hash
+#define sk_portpair		__sk_common.skc_portpair
+#define sk_addrpair		__sk_common.skc_addrpair
+#define sk_daddr		__sk_common.skc_daddr
+#define sk_rcv_saddr		__sk_common.skc_rcv_saddr
 #define sk_family		__sk_common.skc_family
 #define sk_state		__sk_common.skc_state
 #define sk_reuse		__sk_common.skc_reuse
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7ac7aa11130e..56e82a4027b4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -71,17 +71,16 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			    (!reuseport || !sk2->sk_reuseport ||
 			    (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
-				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
-				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
-				    sk2_rcv_saddr == sk_rcv_saddr(sk))
+
+				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
 					break;
 			}
 			if (!relax && reuse && sk2->sk_reuse &&
 			    sk2->sk_state != TCP_LISTEN) {
-				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
 
-				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
-				    sk2_rcv_saddr == sk_rcv_saddr(sk))
+				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
 					break;
 			}
 		}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 72b7eaaf3ca0..8119791e8a95 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -57,8 +57,6 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
 	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
 	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	__be32 sk1_rcv_saddr = sk_rcv_saddr(sk);
-	__be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
 	int sk2_ipv6only = inet_v6_ipv6only(sk2);
 	int addr_type = ipv6_addr_type(sk_rcv_saddr6);
@@ -67,8 +65,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 	/* if both are mapped, treat as IPv4 */
 	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
 		return (!sk2_ipv6only &&
-			(!sk1_rcv_saddr || !sk2_rcv_saddr ||
-			  sk1_rcv_saddr == sk2_rcv_saddr));
+			(!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr ||
+			  sk->sk_rcv_saddr == sk2->sk_rcv_saddr));
 
 	if (addr_type2 == IPV6_ADDR_ANY &&
 	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-- 
cgit v1.2.3


From 34a6eda163975de55c5d6810276c5fa37087e366 Mon Sep 17 00:00:00 2001
From: Peter Senna Tschudin <peter.senna@gmail.com>
Date: Wed, 2 Oct 2013 14:19:51 +0200
Subject: net: ipv4: Change variable type to bool

The variable fully_acked is only assigned the values true and false.
Change its type to bool.

The simplified semantic patch that find this problem is as
follows (http://coccinelle.lip6.fr/):

@exists@
type T;
identifier b;
@@
- T
+ bool
  b = ...;
  ... when any
  b = \(true\|false\)

Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cd65674ece92..fa6cf1f91ff8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3012,7 +3012,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	u32 now = tcp_time_stamp;
-	int fully_acked = true;
+	bool fully_acked = true;
 	int flag = 0;
 	u32 pkts_acked = 0;
 	u32 reord = tp->packets_out;
-- 
cgit v1.2.3


From 96f817fedec48b59c9e8b22141cec4e56ad47913 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 Oct 2013 14:27:25 -0700
Subject: tcp: shrink tcp6_timewait_sock by one cache line

While working on tcp listener refactoring, I found that it
would really make things easier if sock_common could include
the IPv6 addresses needed in the lookups, instead of doing
very complex games to get their values (depending on sock
being SYN_RECV, ESTABLISHED, TIME_WAIT)

For this to happen, I need to be sure that tcp6_timewait_sock
and tcp_timewait_sock consume same number of cache lines.

This is possible if we only use 32bits for tw_ttd, as we remove
one 32bit hole in inet_timewait_sock

inet_tw_time_stamp() is defined and used, even if its current
implementation looks like tcp_time_stamp : We might need finer
resolution for tcp_time_stamp in the future.

Before patch : sizeof(struct tcp6_timewait_sock) = 0xc8

After patch : sizeof(struct tcp6_timewait_sock) = 0xc0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 7 ++++++-
 net/ipv4/inet_diag.c             | 6 +++---
 net/ipv4/inet_timewait_sock.c    | 4 ++--
 net/ipv6/tcp_ipv6.c              | 2 +-
 4 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 48fd3561722c..f528d1b0ac95 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -58,6 +58,11 @@ struct inet_hashinfo;
 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 #endif
 
+static inline u32 inet_tw_time_stamp(void)
+{
+	return jiffies;
+}
+
 /* TIME_WAIT reaping mechanism. */
 #define INET_TWDR_TWKILL_SLOTS	8 /* Please keep this a power of 2. */
 
@@ -130,7 +135,7 @@ struct inet_timewait_sock {
 				tw_tos		: 8,
 				tw_ipv6_offset  : 16;
 	kmemcheck_bitfield_end(flags);
-	unsigned long		tw_ttd;
+	u32			tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
 	struct hlist_node	tw_death_node;
 };
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5f648751fce2..22000279efc8 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -222,7 +222,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
-	long tmo;
+	s32 tmo;
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 
@@ -234,7 +234,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r = nlmsg_data(nlh);
 	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
 
-	tmo = tw->tw_ttd - jiffies;
+	tmo = tw->tw_ttd - inet_tw_time_stamp();
 	if (tmo < 0)
 		tmo = 0;
 
@@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->id.idiag_dst[0]    = tw->tw_daddr;
 	r->idiag_state	      = tw->tw_substate;
 	r->idiag_timer	      = 3;
-	r->idiag_expires      = DIV_ROUND_UP(tmo * 1000, HZ);
+	r->idiag_expires      = jiffies_to_msecs(tmo);
 	r->idiag_rqueue	      = 0;
 	r->idiag_wqueue	      = 0;
 	r->idiag_uid	      = 0;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1f27c9f4afd0..9bcd8f7234ec 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -387,11 +387,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
 			if (slot >= INET_TWDR_TWKILL_SLOTS)
 				slot = INET_TWDR_TWKILL_SLOTS - 1;
 		}
-		tw->tw_ttd = jiffies + timeo;
+		tw->tw_ttd = inet_tw_time_stamp() + timeo;
 		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
 		list = &twdr->cells[slot];
 	} else {
-		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+		tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
 
 		if (twdr->twcal_hand < 0) {
 			twdr->twcal_hand = 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5c71501fc917..dde8bad04481 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1811,7 +1811,7 @@ static void get_timewait6_sock(struct seq_file *seq,
 	const struct in6_addr *dest, *src;
 	__u16 destp, srcp;
 	const struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
-	long delta = tw->tw_ttd - jiffies;
+	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
 
 	dest = &tw6->tw_v6_daddr;
 	src  = &tw6->tw_v6_rcv_saddr;
-- 
cgit v1.2.3


From 5e8a402f831dbe7ee831340a91439e46f0d38acd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 4 Oct 2013 10:31:41 -0700
Subject: tcp: do not forget FIN in tcp_shifted_skb()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Yuchung found following problem :

 There are bugs in the SACK processing code, merging part in
 tcp_shift_skb_data(), that incorrectly resets or ignores the sacked
 skbs FIN flag. When a receiver first SACK the FIN sequence, and later
 throw away ofo queue (e.g., sack-reneging), the sender will stop
 retransmitting the FIN flag, and hangs forever.

Following packetdrill test can be used to reproduce the bug.

$ cat sack-merge-bug.pkt
`sysctl -q net.ipv4.tcp_fack=0`

// Establish a connection and send 10 MSS.
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+.000 bind(3, ..., ...) = 0
+.000 listen(3, 1) = 0

+.050 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+.000 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.001 < . 1:1(0) ack 1 win 1024
+.000 accept(3, ..., ...) = 4

+.100 write(4, ..., 12000) = 12000
+.000 shutdown(4, SHUT_WR) = 0
+.000 > . 1:10001(10000) ack 1
+.050 < . 1:1(0) ack 2001 win 257
+.000 > FP. 10001:12001(2000) ack 1
+.050 < . 1:1(0) ack 2001 win 257 <sack 10001:11001,nop,nop>
+.050 < . 1:1(0) ack 2001 win 257 <sack 10001:12002,nop,nop>
// SACK reneg
+.050 < . 1:1(0) ack 12001 win 257
+0 %{ print "unacked: ",tcpi_unacked }%
+5 %{ print "" }%

First, a typo inverted left/right of one OR operation, then
code forgot to advance end_seq if the merged skb carried FIN.

Bug was added in 2.6.29 by commit 832d11c5cd076ab
("tcp: Try to restore large SKBs while SACK processing")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669d..113dc5f17d47 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1284,7 +1284,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
 	}
 
-	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;
+
 	if (skb == tcp_highest_sack(sk))
 		tcp_advance_highest_sack(sk, skb);
 
-- 
cgit v1.2.3


From 0a7e22609067ff524fc7bbd45c6951dd08561667 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 4 Oct 2013 17:04:48 +0200
Subject: ipv4: fix ineffective source address selection

When sending out multicast messages, the source address in inet->mc_addr is
ignored and rewritten by an autoselected one. This is caused by a typo in
commit 813b3b5db831 ("ipv4: Use caller's on-stack flowi as-is in output
route lookups").

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 727f4365bcdf..6011615e810d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2072,7 +2072,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 							      RT_SCOPE_LINK);
 			goto make_route;
 		}
-		if (fl4->saddr) {
+		if (!fl4->saddr) {
 			if (ipv4_is_multicast(fl4->daddr))
 				fl4->saddr = inet_select_addr(dev_out, 0,
 							      fl4->flowi4_scope);
-- 
cgit v1.2.3


From 005ec9743394010cd37d86c3fd2e81978231cdbf Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <sbohrer@rgmadvisors.com>
Date: Mon, 7 Oct 2013 11:01:38 -0500
Subject: udp: Only allow busy read/poll on connected sockets

UDP sockets can receive packets from multiple endpoints and thus may be
received on multiple receive queues.  Since packets packets can arrive
on multiple receive queues we should not mark the napi_id for all
packets.  This makes busy read/poll only work for connected UDP sockets.

This additionally enables busy read/poll for UDP multicast packets as
long as the socket is connected by moving the check into
__udp_queue_rcv_skb().

Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 5 +++--
 net/ipv6/udp.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c41833e9c083..5950e12bd3ab 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1405,8 +1405,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
 
-	if (inet_sk(sk)->inet_daddr)
+	if (inet_sk(sk)->inet_daddr) {
 		sock_rps_save_rxhash(sk, skb);
+		sk_mark_napi_id(sk, skb);
+	}
 
 	rc = sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
@@ -1716,7 +1718,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk != NULL) {
 		int ret;
 
-		sk_mark_napi_id(sk, skb);
 		ret = udp_queue_rcv_skb(sk, skb);
 		sock_put(sk);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8119791e8a95..37532478e3ba 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -549,8 +549,10 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
 
-	if (!ipv6_addr_any(&inet6_sk(sk)->daddr))
+	if (!ipv6_addr_any(&inet6_sk(sk)->daddr)) {
 		sock_rps_save_rxhash(sk, skb);
+		sk_mark_napi_id(sk, skb);
+	}
 
 	rc = sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
@@ -844,7 +846,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk != NULL) {
 		int ret;
 
-		sk_mark_napi_id(sk, skb);
 		ret = udpv6_queue_rcv_skb(sk, skb);
 		sock_put(sk);
 
-- 
cgit v1.2.3


From 421b3885bf6d56391297844f43fb7154a6396e12 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <sbohrer@rgmadvisors.com>
Date: Mon, 7 Oct 2013 11:01:39 -0500
Subject: udp: ipv4: Add udp early demux

The removal of the routing cache introduced a performance regression for
some UDP workloads since a dst lookup must be done for each packet.
This change caches the dst per socket in a similar manner to what we do
for TCP by implementing early_demux.

For UDP multicast we can only cache the dst if there is only one
receiving socket on the host.  Since caching only works when there is
one receiving socket we do the multicast socket lookup using RCU.

For UDP unicast we only demux sockets with an exact match in order to
not break forwarding setups.  Additionally since the hash chains may be
long we only check the first socket to see if it is a match and not
waste extra time searching the whole chain when we might not find an
exact match.

Benchmark results from a netperf UDP_RR test:
Before 87961.22 transactions/s
After  89789.68 transactions/s

Benchmark results from a fio 1 byte UDP multicast pingpong test
(Multicast one way unicast response):
Before 12.97us RTT
After  12.63us RTT

Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |   2 +-
 include/net/udp.h  |   1 +
 net/ipv4/af_inet.c |   1 +
 net/ipv4/udp.c     | 202 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 187 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index e3bf213be625..79532540201b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -218,7 +218,7 @@ struct cg_proto;
   *	@sk_lock:	synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
   *	@sk_wq: sock wait queue and async head
-  *	@sk_rx_dst: receive input route used by early tcp demux
+  *	@sk_rx_dst: receive input route used by early demux
   *	@sk_dst_cache: destination cache
   *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
diff --git a/include/net/udp.h b/include/net/udp.h
index 510b8cb4d1a7..fe4ba9f32429 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -175,6 +175,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		     unsigned int hash2_nulladdr);
 
 /* net/ipv4/udp.c */
+void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
 				  const struct sock *));
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cfeb85cff4f0..35913fb77dc8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1546,6 +1546,7 @@ static const struct net_protocol tcp_protocol = {
 };
 
 static const struct net_protocol udp_protocol = {
+	.early_demux =	udp_v4_early_demux,
 	.handler =	udp_rcv,
 	.err_handler =	udp_err,
 	.no_policy =	1,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5950e12bd3ab..262ea3929da6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -103,6 +103,7 @@
 #include <linux/seq_file.h>
 #include <net/net_namespace.h>
 #include <net/icmp.h>
+#include <net/inet_hashtables.h>
 #include <net/route.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -565,6 +566,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
+static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+				       __be16 loc_port, __be32 loc_addr,
+				       __be16 rmt_port, __be32 rmt_addr,
+				       int dif, unsigned short hnum)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!net_eq(sock_net(sk), net) ||
+	    udp_sk(sk)->udp_port_hash != hnum ||
+	    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+	    (inet->inet_dport != rmt_port && inet->inet_dport) ||
+	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+	    ipv6_only_sock(sk) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+		return false;
+	return true;
+}
+
 static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 					     __be16 loc_port, __be32 loc_addr,
 					     __be16 rmt_port, __be32 rmt_addr,
@@ -575,20 +596,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 	unsigned short hnum = ntohs(loc_port);
 
 	sk_nulls_for_each_from(s, node) {
-		struct inet_sock *inet = inet_sk(s);
-
-		if (!net_eq(sock_net(s), net) ||
-		    udp_sk(s)->udp_port_hash != hnum ||
-		    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
-		    (inet->inet_dport != rmt_port && inet->inet_dport) ||
-		    (inet->inet_rcv_saddr &&
-		     inet->inet_rcv_saddr != loc_addr) ||
-		    ipv6_only_sock(s) ||
-		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
-			continue;
-		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
-			continue;
-		goto found;
+		if (__udp_is_mcast_sock(net, s,
+					loc_port, loc_addr,
+					rmt_port, rmt_addr,
+					dif, hnum))
+			goto found;
 	}
 	s = NULL;
 found:
@@ -1581,6 +1593,14 @@ static void flush_stack(struct sock **stack, unsigned int count,
 		kfree_skb(skb1);
 }
 
+static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	dst_hold(dst);
+	sk->sk_rx_dst = dst;
+}
+
 /*
  *	Multicasts and broadcasts go to each listener.
  *
@@ -1709,11 +1729,28 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (udp4_csum_init(skb, uh, proto))
 		goto csum_error;
 
-	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
-		return __udp4_lib_mcast_deliver(net, skb, uh,
-				saddr, daddr, udptable);
+	if (skb->sk) {
+		int ret;
+		sk = skb->sk;
 
-	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+		if (unlikely(sk->sk_rx_dst == NULL))
+			udp_sk_rx_dst_set(sk, skb);
+
+		ret = udp_queue_rcv_skb(sk, skb);
+
+		/* a return value > 0 means to resubmit the input, but
+		 * it wants the return to be -protocol, or 0
+		 */
+		if (ret > 0)
+			return -ret;
+		return 0;
+	} else {
+		if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+			return __udp4_lib_mcast_deliver(net, skb, uh,
+					saddr, daddr, udptable);
+
+		sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+	}
 
 	if (sk != NULL) {
 		int ret;
@@ -1771,6 +1808,135 @@ drop:
 	return 0;
 }
 
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket found returns NULL
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+						  __be16 loc_port, __be32 loc_addr,
+						  __be16 rmt_port, __be32 rmt_addr,
+						  int dif)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(loc_port);
+	unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+	struct udp_hslot *hslot = &udp_table.hash[slot];
+
+	rcu_read_lock();
+begin:
+	count = 0;
+	result = NULL;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk,
+					loc_port, loc_addr,
+					rmt_port, rmt_addr,
+					dif, hnum)) {
+			result = sk;
+			++count;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+	if (result) {
+		if (count != 1 ||
+		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		else if (unlikely(!__udp_is_mcast_sock(net, sk,
+						       loc_port, loc_addr,
+						       rmt_port, rmt_addr,
+						       dif, hnum))) {
+			sock_put(result);
+			result = NULL;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups.  The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+					    __be16 loc_port, __be32 loc_addr,
+					    __be16 rmt_port, __be32 rmt_addr,
+					    int dif)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(loc_port);
+	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+	unsigned int slot2 = hash2 & udp_table.mask;
+	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr)
+	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+	rcu_read_lock();
+	result = NULL;
+	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+		if (INET_MATCH(sk, net, acookie,
+			       rmt_addr, loc_addr, ports, dif))
+			result = sk;
+		/* Only check first socket in chain */
+		break;
+	}
+
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		else if (unlikely(!INET_MATCH(sk, net, acookie,
+					      rmt_addr, loc_addr,
+					      ports, dif))) {
+			sock_put(result);
+			result = NULL;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+void udp_v4_early_demux(struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *uh = udp_hdr(skb);
+	struct sock *sk;
+	struct dst_entry *dst;
+	struct net *net = dev_net(skb->dev);
+	int dif = skb->dev->ifindex;
+
+	/* validate the packet */
+	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+		return;
+
+	if (skb->pkt_type == PACKET_BROADCAST ||
+	    skb->pkt_type == PACKET_MULTICAST)
+		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+						   uh->source, iph->saddr, dif);
+	else if (skb->pkt_type == PACKET_HOST)
+		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+					     uh->source, iph->saddr, dif);
+	else
+		return;
+
+	if (!sk)
+		return;
+
+	skb->sk = sk;
+	skb->destructor = sock_edemux;
+	dst = sk->sk_rx_dst;
+
+	if (dst)
+		dst = dst_check(dst, 0);
+	if (dst)
+		skb_dst_set_noref(skb, dst);
+}
+
 int udp_rcv(struct sk_buff *skb)
 {
 	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
-- 
cgit v1.2.3


From fbf8866d65d5de84f75563eb0edd7fc27dbe9a90 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <sbohrer@rgmadvisors.com>
Date: Mon, 7 Oct 2013 11:01:40 -0500
Subject: net: ipv4 only populate IP_PKTINFO when needed

The since the removal of the routing cache computing
fib_compute_spec_dst() does a fib_table lookup for each UDP multicast
packet received.  This has introduced a performance regression for some
UDP workloads.

This change skips populating the packet info for sockets that do not have
IP_PKTINFO set.

Benchmark results from a netperf UDP_RR test:
Before 89789.68 transactions/s
After  90587.62 transactions/s

Benchmark results from a fio 1 byte UDP multicast pingpong test
(Multicast one way unicast response):
Before 12.63us RTT
After  12.48us RTT

Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       | 2 +-
 net/ipv4/ip_sockglue.c | 5 +++--
 net/ipv4/raw.c         | 2 +-
 net/ipv4/udp.c         | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 16078f422397..b39ebe5339ac 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -459,7 +459,7 @@ int ip_options_rcv_srr(struct sk_buff *skb);
  *	Functions provided by ip_sockglue.c
  */
 
-void ipv4_pktinfo_prepare(struct sk_buff *skb);
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc);
 int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 56e34457ac07..0626f2cb192e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1052,11 +1052,12 @@ e_inval:
  * destination in skb->cb[] before dst drop.
  * This way, receiver doesnt make cache line misses to read rtable.
  */
-void ipv4_pktinfo_prepare(struct sk_buff *skb)
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 {
 	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
 
-	if (skb_rtable(skb)) {
+	if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) &&
+	    skb_rtable(skb)) {
 		pktinfo->ipi_ifindex = inet_iif(skb);
 		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index b2fa14c1a6f1..41e1d2845c8f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -299,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	/* Charge it to the socket. */
 
-	ipv4_pktinfo_prepare(skb);
+	ipv4_pktinfo_prepare(sk, skb);
 	if (sock_queue_rcv_skb(sk, skb) < 0) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 262ea3929da6..4226c53daaed 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1544,7 +1544,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	rc = 0;
 
-	ipv4_pktinfo_prepare(skb);
+	ipv4_pktinfo_prepare(sk, skb);
 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk))
 		rc = __udp_queue_rcv_skb(sk, skb);
-- 
cgit v1.2.3


From 05dbc7b59481ca891bbcfe6799a562d48159fbf7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 Oct 2013 00:22:02 -0700
Subject: tcp/dccp: remove twchain

TCP listener refactoring, part 3 :

Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.

Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.

As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.

If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.

[ INET_TW_MATCH() is no longer needed ]

I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()

This way, SYN_RECV pseudo sockets will be supported the same.

A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].

Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()

Before patch :

dmesg | grep "TCP established"

TCP established hash table entries: 524288 (order: 11, 8388608 bytes)

After patch :

TCP established hash table entries: 524288 (order: 10, 4194304 bytes)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h    |  9 +----
 include/net/inet_timewait_sock.h | 13 +------
 include/net/sock.h               |  8 +++-
 include/net/tcp.h                |  1 -
 net/dccp/proto.c                 |  4 +-
 net/ipv4/inet_diag.c             | 48 +++++++----------------
 net/ipv4/inet_hashtables.c       | 83 ++++++++++++++--------------------------
 net/ipv4/inet_timewait_sock.c    | 55 +++++++++++++-------------
 net/ipv4/tcp.c                   |  5 +--
 net/ipv4/tcp_ipv4.c              | 83 +++++++---------------------------------
 net/ipv6/inet6_hashtables.c      | 75 +++++++++++++++---------------------
 net/ipv6/tcp_ipv6.c              |  9 +++--
 12 files changed, 132 insertions(+), 261 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 10d6838378c3..1bdb47715def 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -37,12 +37,11 @@
 #include <asm/byteorder.h>
 
 /* This is for all connections with a full identity, no wildcards.
- * One chain is dedicated to TIME_WAIT sockets.
- * I'll experiment with dynamic table growth later.
+ * The 'e' prefix stands for Establish, but we really put all sockets
+ * but LISTEN ones.
  */
 struct inet_ehash_bucket {
 	struct hlist_nulls_head chain;
-	struct hlist_nulls_head twchain;
 };
 
 /* There are a few simple rules, which allow for local port reuse by
@@ -123,7 +122,6 @@ struct inet_hashinfo {
 	 *
 	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
 	 *
-	 * TIME_WAIT sockets use a separate chain (twchain).
 	 */
 	struct inet_ehash_bucket	*ehash;
 	spinlock_t			*ehash_locks;
@@ -318,9 +316,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
 	 net_eq(sock_net(__sk), (__net)))
 #endif /* 64-bit arch */
 
-#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
-	INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)
-
 /*
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f528d1b0ac95..de9e3ab7d43d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -141,18 +141,6 @@ struct inet_timewait_sock {
 };
 #define tw_tclass tw_tos
 
-static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
-				      struct hlist_nulls_head *list)
-{
-	hlist_nulls_add_head_rcu(&tw->tw_node, list);
-}
-
-static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
-					   struct hlist_head *list)
-{
-	hlist_add_head(&tw->tw_bind_node, list);
-}
-
 static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
 {
 	return !hlist_unhashed(&tw->tw_death_node);
@@ -192,6 +180,7 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 	return (struct inet_timewait_sock *)sk;
 }
 
+void inet_twsk_free(struct inet_timewait_sock *tw);
 void inet_twsk_put(struct inet_timewait_sock *tw);
 
 int inet_twsk_unhash(struct inet_timewait_sock *tw);
diff --git a/include/net/sock.h b/include/net/sock.h
index 7cf8d2331afb..3f3e48c4704d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -156,7 +156,7 @@ typedef __u64 __bitwise __addrpair;
  */
 struct sock_common {
 	/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
-	 * address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH()
+	 * address on 64bit arches : cf INET_MATCH()
 	 */
 	union {
 		__addrpair	skc_addrpair;
@@ -301,6 +301,8 @@ struct sock {
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
 #define sk_hash			__sk_common.skc_hash
 #define sk_portpair		__sk_common.skc_portpair
+#define sk_num			__sk_common.skc_num
+#define sk_dport		__sk_common.skc_dport
 #define sk_addrpair		__sk_common.skc_addrpair
 #define sk_daddr		__sk_common.skc_daddr
 #define sk_rcv_saddr		__sk_common.skc_rcv_saddr
@@ -1653,6 +1655,10 @@ static inline void sock_put(struct sock *sk)
 	if (atomic_dec_and_test(&sk->sk_refcnt))
 		sk_free(sk);
 }
+/* Generic version of sock_put(), dealing with all sockets
+ * (TCP_TIMEWAIT, ESTABLISHED...)
+ */
+void sock_gen_put(struct sock *sk);
 
 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de870ee5582d..39bbfa1602b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1519,7 +1519,6 @@ enum tcp_seq_states {
 	TCP_SEQ_STATE_LISTENING,
 	TCP_SEQ_STATE_OPENREQ,
 	TCP_SEQ_STATE_ESTABLISHED,
-	TCP_SEQ_STATE_TIME_WAIT,
 };
 
 int tcp_seq_open(struct inode *inode, struct file *file);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ba64750f0387..eb892b4f4814 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1158,10 +1158,8 @@ static int __init dccp_init(void)
 		goto out_free_bind_bucket_cachep;
 	}
 
-	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
+	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
 		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
-		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
-	}
 
 	if (inet_ehash_locks_alloc(&dccp_hashinfo))
 			goto out_free_dccp_ehash;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 22000279efc8..8e1e40653357 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -635,12 +635,14 @@ static int inet_csk_diag_dump(struct sock *sk,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
-static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_dump(struct sock *sk,
 			       struct sk_buff *skb,
 			       struct netlink_callback *cb,
 			       struct inet_diag_req_v2 *r,
 			       const struct nlattr *bc)
 {
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+
 	if (bc != NULL) {
 		struct inet_diag_entry entry;
 
@@ -911,8 +913,7 @@ skip_listen_ht:
 
 		num = 0;
 
-		if (hlist_nulls_empty(&head->chain) &&
-			hlist_nulls_empty(&head->twchain))
+		if (hlist_nulls_empty(&head->chain))
 			continue;
 
 		if (i > s_i)
@@ -920,7 +921,7 @@ skip_listen_ht:
 
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &head->chain) {
-			struct inet_sock *inet = inet_sk(sk);
+			int res;
 
 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -929,15 +930,19 @@ skip_listen_ht:
 			if (!(r->idiag_states & (1 << sk->sk_state)))
 				goto next_normal;
 			if (r->sdiag_family != AF_UNSPEC &&
-					sk->sk_family != r->sdiag_family)
+			    sk->sk_family != r->sdiag_family)
 				goto next_normal;
-			if (r->id.idiag_sport != inet->inet_sport &&
+			if (r->id.idiag_sport != htons(sk->sk_num) &&
 			    r->id.idiag_sport)
 				goto next_normal;
-			if (r->id.idiag_dport != inet->inet_dport &&
+			if (r->id.idiag_dport != sk->sk_dport &&
 			    r->id.idiag_dport)
 				goto next_normal;
-			if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+			if (sk->sk_state == TCP_TIME_WAIT)
+				res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
+			else
+				res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+			if (res < 0) {
 				spin_unlock_bh(lock);
 				goto done;
 			}
@@ -945,33 +950,6 @@ next_normal:
 			++num;
 		}
 
-		if (r->idiag_states & TCPF_TIME_WAIT) {
-			struct inet_timewait_sock *tw;
-
-			inet_twsk_for_each(tw, node,
-				    &head->twchain) {
-				if (!net_eq(twsk_net(tw), net))
-					continue;
-
-				if (num < s_num)
-					goto next_dying;
-				if (r->sdiag_family != AF_UNSPEC &&
-						tw->tw_family != r->sdiag_family)
-					goto next_dying;
-				if (r->id.idiag_sport != tw->tw_sport &&
-				    r->id.idiag_sport)
-					goto next_dying;
-				if (r->id.idiag_dport != tw->tw_dport &&
-				    r->id.idiag_dport)
-					goto next_dying;
-				if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
-					spin_unlock_bh(lock);
-					goto done;
-				}
-next_dying:
-				++num;
-			}
-		}
 		spin_unlock_bh(lock);
 	}
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ae199596b9b0..a4b66bbe4f21 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -230,6 +230,19 @@ begin:
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+	if (!atomic_dec_and_test(&sk->sk_refcnt))
+		return;
+
+	if (sk->sk_state == TCP_TIME_WAIT)
+		inet_twsk_free(inet_twsk(sk));
+	else
+		sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
 struct sock *__inet_lookup_established(struct net *net,
 				  struct inet_hashinfo *hashinfo,
 				  const __be32 saddr, const __be16 sport,
@@ -255,13 +268,13 @@ begin:
 		if (likely(INET_MATCH(sk, net, acookie,
 				      saddr, daddr, ports, dif))) {
 			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
-				goto begintw;
+				goto out;
 			if (unlikely(!INET_MATCH(sk, net, acookie,
 						 saddr, daddr, ports, dif))) {
-				sock_put(sk);
+				sock_gen_put(sk);
 				goto begin;
 			}
-			goto out;
+			goto found;
 		}
 	}
 	/*
@@ -271,37 +284,9 @@ begin:
 	 */
 	if (get_nulls_value(node) != slot)
 		goto begin;
-
-begintw:
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
-		if (sk->sk_hash != hash)
-			continue;
-		if (likely(INET_TW_MATCH(sk, net, acookie,
-					 saddr, daddr, ports,
-					 dif))) {
-			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
-				sk = NULL;
-				goto out;
-			}
-			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
-						    saddr, daddr, ports,
-						    dif))) {
-				inet_twsk_put(inet_twsk(sk));
-				goto begintw;
-			}
-			goto out;
-		}
-	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begintw;
-	sk = NULL;
 out:
+	sk = NULL;
+found:
 	rcu_read_unlock();
 	return sk;
 }
@@ -326,39 +311,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
-	struct inet_timewait_sock *tw;
+	struct inet_timewait_sock *tw = NULL;
 	int twrefcnt = 0;
 
 	spin_lock(lock);
 
-	/* Check TIME-WAIT sockets first. */
-	sk_nulls_for_each(sk2, node, &head->twchain) {
-		if (sk2->sk_hash != hash)
-			continue;
-
-		if (likely(INET_TW_MATCH(sk2, net, acookie,
-					 saddr, daddr, ports, dif))) {
-			tw = inet_twsk(sk2);
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
 	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (sk2->sk_hash != hash)
 			continue;
+
 		if (likely(INET_MATCH(sk2, net, acookie,
-				      saddr, daddr, ports, dif)))
+					 saddr, daddr, ports, dif))) {
+			if (sk2->sk_state == TCP_TIME_WAIT) {
+				tw = inet_twsk(sk2);
+				if (twsk_unique(sk, sk2, twp))
+					break;
+			}
 			goto not_unique;
+		}
 	}
 
-unique:
 	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
+	 * in hash table socket with a funny identity.
+	 */
 	inet->inet_num = lport;
 	inet->inet_sport = htons(lport);
 	sk->sk_hash = hash;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 9bcd8f7234ec..6d592f8555fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -87,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
 	spin_unlock(&bhead->lock);
 
-#ifdef SOCK_REFCNT_DEBUG
-	if (atomic_read(&tw->tw_refcnt) != 1) {
-		pr_debug("%s timewait_sock %p refcnt=%d\n",
-			 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
-	}
-#endif
-	while (refcnt) {
-		inet_twsk_put(tw);
-		refcnt--;
-	}
+	BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+	atomic_sub(refcnt, &tw->tw_refcnt);
 }
 
-static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
+void inet_twsk_free(struct inet_timewait_sock *tw)
 {
 	struct module *owner = tw->tw_prot->owner;
 	twsk_destructor((struct sock *)tw);
@@ -118,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
 }
 EXPORT_SYMBOL_GPL(inet_twsk_put);
 
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+				   struct hlist_nulls_head *list)
+{
+	hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+				    struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_bind_node, list);
+}
+
 /*
  * Enter the time wait state. This is called with locally disabled BH.
  * Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -146,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	spin_lock(lock);
 
 	/*
-	 * Step 2: Hash TW into TIMEWAIT chain.
-	 * Should be done before removing sk from established chain
-	 * because readers are lockless and search established first.
+	 * Step 2: Hash TW into tcp ehash chain.
+	 * Notes :
+	 * - tw_refcnt is set to 3 because :
+	 * - We have one reference from bhash chain.
+	 * - We have one reference from ehash chain.
+	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
+	 * committed into memory all tw fields.
 	 */
-	inet_twsk_add_node_rcu(tw, &ehead->twchain);
+	atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+	inet_twsk_add_node_rcu(tw, &ehead->chain);
 
-	/* Step 3: Remove SK from established hash. */
+	/* Step 3: Remove SK from hash chain */
 	if (__sk_nulls_del_node_init_rcu(sk))
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
-	/*
-	 * Notes :
-	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
-	 * - We add one reference for the bhash link
-	 * - We add one reference for the ehash link
-	 * - We want this refcnt update done before allowing other
-	 *   threads to find this tw in ehash chain.
-	 */
-	atomic_add(1 + 1 + 1, &tw->tw_refcnt);
-
 	spin_unlock(lock);
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -490,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
 restart_rcu:
 		rcu_read_lock();
 restart:
-		sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+		sk_nulls_for_each_rcu(sk, node, &head->chain) {
+			if (sk->sk_state != TCP_TIME_WAIT)
+				continue;
 			tw = inet_twsk(sk);
 			if ((tw->tw_family != family) ||
 				atomic_read(&twsk_net(tw)->count))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6e5617b9f9db..be4b161802e8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3137,10 +3137,9 @@ void __init tcp_init(void)
 					&tcp_hashinfo.ehash_mask,
 					0,
 					thash_entries ? 0 : 512 * 1024);
-	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
-		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
-	}
+
 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
 		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5d6b1a609da8..e4695dde1af6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2194,18 +2194,6 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
-{
-	return hlist_nulls_empty(head) ? NULL :
-		list_entry(head->first, struct inet_timewait_sock, tw_node);
-}
-
-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
-{
-	return !is_a_nulls(tw->tw_node.next) ?
-		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
-}
-
 /*
  * Get next listener socket follow cur.  If cur is NULL, get first socket
  * starting from bucket given in st->bucket; when st->bucket is zero the
@@ -2309,10 +2297,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	return rc;
 }
 
-static inline bool empty_bucket(struct tcp_iter_state *st)
+static inline bool empty_bucket(const struct tcp_iter_state *st)
 {
-	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
-		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
 }
 
 /*
@@ -2329,7 +2316,6 @@ static void *established_get_first(struct seq_file *seq)
 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
-		struct inet_timewait_sock *tw;
 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
 		/* Lockless fast path for the common case of empty buckets */
@@ -2345,18 +2331,7 @@ static void *established_get_first(struct seq_file *seq)
 			rc = sk;
 			goto out;
 		}
-		st->state = TCP_SEQ_STATE_TIME_WAIT;
-		inet_twsk_for_each(tw, node,
-				   &tcp_hashinfo.ehash[st->bucket].twchain) {
-			if (tw->tw_family != st->family ||
-			    !net_eq(twsk_net(tw), net)) {
-				continue;
-			}
-			rc = tw;
-			goto out;
-		}
 		spin_unlock_bh(lock);
-		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
 out:
 	return rc;
@@ -2365,7 +2340,6 @@ out:
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
-	struct inet_timewait_sock *tw;
 	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
@@ -2373,45 +2347,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	++st->num;
 	++st->offset;
 
-	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
-		tw = cur;
-		tw = tw_next(tw);
-get_tw:
-		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
-			tw = tw_next(tw);
-		}
-		if (tw) {
-			cur = tw;
-			goto out;
-		}
-		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-		st->state = TCP_SEQ_STATE_ESTABLISHED;
-
-		/* Look for next non empty bucket */
-		st->offset = 0;
-		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
-				empty_bucket(st))
-			;
-		if (st->bucket > tcp_hashinfo.ehash_mask)
-			return NULL;
-
-		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
-	} else
-		sk = sk_nulls_next(sk);
+	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
-			goto found;
+			return sk;
 	}
 
-	st->state = TCP_SEQ_STATE_TIME_WAIT;
-	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
-	goto get_tw;
-found:
-	cur = sk;
-out:
-	return cur;
+	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+	++st->bucket;
+	return established_get_first(seq);
 }
 
 static void *established_get_idx(struct seq_file *seq, loff_t pos)
@@ -2464,10 +2409,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 		if (rc)
 			break;
 		st->bucket = 0;
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		/* Fallthrough */
 	case TCP_SEQ_STATE_ESTABLISHED:
-	case TCP_SEQ_STATE_TIME_WAIT:
-		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		if (st->bucket > tcp_hashinfo.ehash_mask)
 			break;
 		rc = established_get_first(seq);
@@ -2524,7 +2468,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		}
 		break;
 	case TCP_SEQ_STATE_ESTABLISHED:
-	case TCP_SEQ_STATE_TIME_WAIT:
 		rc = established_get_next(seq, v);
 		break;
 	}
@@ -2548,7 +2491,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 		if (v != SEQ_START_TOKEN)
 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
 		break;
-	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2707,6 +2649,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 static int tcp4_seq_show(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st;
+	struct sock *sk = v;
 	int len;
 
 	if (v == SEQ_START_TOKEN) {
@@ -2721,14 +2664,14 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
 	case TCP_SEQ_STATE_ESTABLISHED:
-		get_tcp4_sock(v, seq, st->num, &len);
+		if (sk->sk_state == TCP_TIME_WAIT)
+			get_timewait4_sock(v, seq, st->num, &len);
+		else
+			get_tcp4_sock(v, seq, st->num, &len);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
 		break;
-	case TCP_SEQ_STATE_TIME_WAIT:
-		get_timewait4_sock(v, seq, st->num, &len);
-		break;
 	}
 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
 out:
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 066640e0ba8e..46440777e1c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -89,43 +89,36 @@ begin:
 	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (sk->sk_hash != hash)
 			continue;
-		if (likely(INET6_MATCH(sk, net, saddr, daddr, ports, dif))) {
-			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
-				goto begintw;
+		if (sk->sk_state == TCP_TIME_WAIT) {
+			if (!INET6_TW_MATCH(sk, net, saddr, daddr, ports, dif))
+				continue;
+		} else {
+			if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
+				continue;
+		}
+		if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+			goto out;
+
+		if (sk->sk_state == TCP_TIME_WAIT) {
+			if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
+						     ports, dif))) {
+				sock_gen_put(sk);
+				goto begin;
+			}
+		} else {
 			if (unlikely(!INET6_MATCH(sk, net, saddr, daddr,
 						  ports, dif))) {
 				sock_put(sk);
 				goto begin;
 			}
-		goto out;
+		goto found;
 		}
 	}
 	if (get_nulls_value(node) != slot)
 		goto begin;
-
-begintw:
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
-		if (sk->sk_hash != hash)
-			continue;
-		if (likely(INET6_TW_MATCH(sk, net, saddr, daddr,
-					  ports, dif))) {
-			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
-				sk = NULL;
-				goto out;
-			}
-			if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
-						     ports, dif))) {
-				inet_twsk_put(inet_twsk(sk));
-				goto begintw;
-			}
-			goto out;
-		}
-	}
-	if (get_nulls_value(node) != slot)
-		goto begintw;
-	sk = NULL;
 out:
+	sk = NULL;
+found:
 	rcu_read_unlock();
 	return sk;
 }
@@ -248,31 +241,25 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
-	struct inet_timewait_sock *tw;
+	struct inet_timewait_sock *tw = NULL;
 	int twrefcnt = 0;
 
 	spin_lock(lock);
 
-	/* Check TIME-WAIT sockets first. */
-	sk_nulls_for_each(sk2, node, &head->twchain) {
+	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (sk2->sk_hash != hash)
 			continue;
 
-		if (likely(INET6_TW_MATCH(sk2, net, saddr, daddr,
-					  ports, dif))) {
-			tw = inet_twsk(sk2);
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
+		if (sk2->sk_state == TCP_TIME_WAIT) {
+			if (likely(INET6_TW_MATCH(sk2, net, saddr, daddr,
+						  ports, dif))) {
+				tw = inet_twsk(sk2);
+				if (twsk_unique(sk, sk2, twp))
+					goto unique;
+				else
+					goto not_unique;
+			}
 		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_nulls_for_each(sk2, node, &head->chain) {
-		if (sk2->sk_hash != hash)
-			continue;
 		if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif)))
 			goto not_unique;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index dde8bad04481..528e61afaf5e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1834,6 +1834,7 @@ static void get_timewait6_sock(struct seq_file *seq,
 static int tcp6_seq_show(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st;
+	struct sock *sk = v;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -1849,14 +1850,14 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
 	case TCP_SEQ_STATE_ESTABLISHED:
-		get_tcp6_sock(seq, v, st->num);
+		if (sk->sk_state == TCP_TIME_WAIT)
+			get_timewait6_sock(seq, v, st->num);
+		else
+			get_tcp6_sock(seq, v, st->num);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
 		get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid);
 		break;
-	case TCP_SEQ_STATE_TIME_WAIT:
-		get_timewait6_sock(seq, v, st->num);
-		break;
 	}
 out:
 	return 0;
-- 
cgit v1.2.3


From efe4208f47f907b86f528788da711e8ab9dea44d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 Oct 2013 15:42:29 -0700
Subject: ipv6: make lookups simpler and faster

TCP listener refactoring, part 4 :

To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common

Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.

Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).

inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6

This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.

inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr

And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.

We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h                           | 46 +++-----------------
 include/net/inet6_hashtables.h                 |  5 +--
 include/net/inet_timewait_sock.h               |  4 +-
 include/net/ip.h                               |  2 +-
 include/net/ip6_checksum.h                     |  2 +-
 include/net/sock.h                             |  9 ++++
 net/dccp/ipv6.c                                | 24 +++++------
 net/dccp/ipv6.h                                |  1 -
 net/dccp/minisocks.c                           |  7 +---
 net/ipv4/inet_diag.c                           | 35 +++++++---------
 net/ipv4/ping.c                                | 15 ++++---
 net/ipv4/tcp_metrics.c                         | 10 ++---
 net/ipv4/tcp_minisocks.c                       |  7 +---
 net/ipv4/tcp_probe.c                           | 29 +++++--------
 net/ipv4/tcp_timer.c                           |  3 +-
 net/ipv6/af_inet6.c                            | 10 ++---
 net/ipv6/datagram.c                            | 25 ++++++-----
 net/ipv6/inet6_connection_sock.c               |  7 ++--
 net/ipv6/inet6_hashtables.c                    | 58 +++++++++-----------------
 net/ipv6/ipv6_sockglue.c                       |  7 ++--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  4 +-
 net/ipv6/ping.c                                |  2 +-
 net/ipv6/raw.c                                 | 17 ++++----
 net/ipv6/tcp_ipv6.c                            | 44 ++++++++++---------
 net/ipv6/udp.c                                 | 48 ++++++++++-----------
 net/l2tp/l2tp_core.c                           | 10 ++---
 net/l2tp/l2tp_debugfs.c                        |  5 ++-
 net/l2tp/l2tp_ip6.c                            | 16 +++----
 net/l2tp/l2tp_netlink.c                        |  4 +-
 net/l2tp/l2tp_ppp.c                            | 12 +++---
 net/netfilter/xt_TPROXY.c                      |  2 +-
 net/netfilter/xt_socket.c                      |  2 +-
 net/sctp/ipv6.c                                | 22 +++++-----
 net/sunrpc/svcsock.c                           |  2 +-
 security/lsm_audit.c                           |  5 +--
 35 files changed, 213 insertions(+), 288 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index b7f1f3bb346d..35f6c1b562c4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -141,8 +141,6 @@ struct ipv6_fl_socklist;
  */
 struct ipv6_pinfo {
 	struct in6_addr 	saddr;
-	struct in6_addr 	rcv_saddr;
-	struct in6_addr		daddr;
 	struct in6_pktinfo	sticky_pktinfo;
 	const struct in6_addr		*daddr_cache;
 #ifdef CONFIG_IPV6_SUBTREES
@@ -256,22 +254,10 @@ struct tcp6_sock {
 
 extern int inet6_sk_rebuild_header(struct sock *sk);
 
-struct inet6_timewait_sock {
-	struct in6_addr tw_v6_daddr;
-	struct in6_addr	tw_v6_rcv_saddr;
-};
-
 struct tcp6_timewait_sock {
 	struct tcp_timewait_sock   tcp6tw_tcp;
-	struct inet6_timewait_sock tcp6tw_inet6;
 };
 
-static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
-{
-	return (struct inet6_timewait_sock *)(((u8 *)sk) +
-					      inet_twsk(sk)->tw_ipv6_offset);
-}
-
 #if IS_ENABLED(CONFIG_IPV6)
 static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 {
@@ -321,21 +307,11 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #define __ipv6_only_sock(sk)	(inet6_sk(sk)->ipv6only)
 #define ipv6_only_sock(sk)	((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
 
-static inline u16 inet6_tw_offset(const struct proto *prot)
-{
-	return prot->twsk_prot->twsk_obj_size -
-			sizeof(struct inet6_timewait_sock);
-}
-
-static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk)
+static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
 {
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		&inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr;
-}
-
-static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
-{
-	return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL;
+	if (sk->sk_family == AF_INET6)
+		return &sk->sk_v6_rcv_saddr;
+	return NULL;
 }
 
 static inline int inet_v6_ipv6only(const struct sock *sk)
@@ -363,7 +339,6 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 	return NULL;
 }
 
-#define __inet6_rcv_saddr(__sk)	NULL
 #define inet6_rcv_saddr(__sk)	NULL
 #define tcp_twsk_ipv6only(__sk)		0
 #define inet_v6_ipv6only(__sk)		0
@@ -372,19 +347,10 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
 	(((__sk)->sk_portpair == (__ports))			&&	\
 	 ((__sk)->sk_family == AF_INET6)			&&	\
-	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&&	\
-	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&&	\
+	 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))		&&	\
+	 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))	&&	\
 	 (!(__sk)->sk_bound_dev_if	||				\
 	   ((__sk)->sk_bound_dev_if == (__dif))) 		&&	\
 	 net_eq(sock_net(__sk), (__net)))
 
-#define INET6_TW_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	   \
-	(((__sk)->sk_portpair == (__ports))				&& \
-	 ((__sk)->sk_family == AF_INET6)				&& \
-	 ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_daddr, (__saddr))	&& \
-	 ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_rcv_saddr, (__daddr)) && \
-	 (!(__sk)->sk_bound_dev_if	||				   \
-	  ((__sk)->sk_bound_dev_if == (__dif)))				&& \
-	 net_eq(sock_net(__sk), (__net)))
-
 #endif /* _IPV6_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index f52fa88feb64..a105d1a2fc00 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -43,9 +43,8 @@ static inline unsigned int inet6_ehashfn(struct net *net,
 static inline int inet6_sk_ehashfn(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *laddr = &np->rcv_saddr;
-	const struct in6_addr *faddr = &np->daddr;
+	const struct in6_addr *laddr = &sk->sk_v6_rcv_saddr;
+	const struct in6_addr *faddr = &sk->sk_v6_daddr;
 	const __u16 lport = inet->inet_num;
 	const __be16 fport = inet->inet_dport;
 	struct net *net = sock_net(sk);
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index de9e3ab7d43d..b647c6270eb7 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -116,7 +116,9 @@ struct inet_timewait_sock {
 #define tw_prot			__tw_common.skc_prot
 #define tw_net			__tw_common.skc_net
 #define tw_daddr        	__tw_common.skc_daddr
+#define tw_v6_daddr		__tw_common.skc_v6_daddr
 #define tw_rcv_saddr    	__tw_common.skc_rcv_saddr
+#define tw_v6_rcv_saddr    	__tw_common.skc_v6_rcv_saddr
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
 
@@ -133,7 +135,7 @@ struct inet_timewait_sock {
 				tw_transparent  : 1,
 				tw_pad		: 6,	/* 6 bits hole */
 				tw_tos		: 8,
-				tw_ipv6_offset  : 16;
+				tw_pad2		: 16 /* 16 bits hole */
 	kmemcheck_bitfield_end(flags);
 	u32			tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
diff --git a/include/net/ip.h b/include/net/ip.h
index b39ebe5339ac..217bc5bfc6c6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -374,7 +374,7 @@ static __inline__ void inet_reset_saddr(struct sock *sk)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		memset(&np->saddr, 0, sizeof(np->saddr));
-		memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr));
+		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
 	}
 #endif
 }
diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h
index 7686e3f5033d..1944406949ba 100644
--- a/include/net/ip6_checksum.h
+++ b/include/net/ip6_checksum.h
@@ -70,7 +70,7 @@ static inline void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 
-	__tcp_v6_send_check(skb, &np->saddr, &np->daddr);
+	__tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr);
 }
 
 int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
diff --git a/include/net/sock.h b/include/net/sock.h
index 3f3e48c4704d..7e50df5c71d4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -191,6 +191,12 @@ struct sock_common {
 #ifdef CONFIG_NET_NS
 	struct net	 	*skc_net;
 #endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr		skc_v6_daddr;
+	struct in6_addr		skc_v6_rcv_saddr;
+#endif
+
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -314,6 +320,9 @@ struct sock {
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
 #define sk_net			__sk_common.skc_net
+#define sk_v6_daddr		__sk_common.skc_v6_daddr
+#define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
+
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
 	/*
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6cf9f7782ad4..7f075b83128a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -67,7 +67,7 @@ static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
 	struct dccp_hdr *dh = dccp_hdr(skb);
 
 	dccp_csum_outgoing(skb);
-	dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &np->daddr);
+	dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
 }
 
 static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
@@ -467,11 +467,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr);
 
 		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
 
-		newnp->rcv_saddr = newnp->saddr;
+		newsk->sk_v6_rcv_saddr = newnp->saddr;
 
 		inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
 		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
@@ -538,9 +538,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	newnp->daddr = ireq6->rmt_addr;
+	newsk->sk_v6_daddr = ireq6->rmt_addr;
 	newnp->saddr = ireq6->loc_addr;
-	newnp->rcv_saddr = ireq6->loc_addr;
+	newsk->sk_v6_rcv_saddr = ireq6->loc_addr;
 	newsk->sk_bound_dev_if = ireq6->iif;
 
 	/* Now IPv6 options...
@@ -885,7 +885,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			return -EINVAL;
 	}
 
-	np->daddr = usin->sin6_addr;
+	sk->sk_v6_daddr = usin->sin6_addr;
 	np->flow_label = fl6.flowlabel;
 
 	/*
@@ -915,16 +915,16 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			goto failure;
 		}
 		ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
-		ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr);
+		ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr);
 
 		return err;
 	}
 
-	if (!ipv6_addr_any(&np->rcv_saddr))
-		saddr = &np->rcv_saddr;
+	if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+		saddr = &sk->sk_v6_rcv_saddr;
 
 	fl6.flowi6_proto = IPPROTO_DCCP;
-	fl6.daddr = np->daddr;
+	fl6.daddr = sk->sk_v6_daddr;
 	fl6.saddr = saddr ? *saddr : np->saddr;
 	fl6.flowi6_oif = sk->sk_bound_dev_if;
 	fl6.fl6_dport = usin->sin6_port;
@@ -941,7 +941,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	if (saddr == NULL) {
 		saddr = &fl6.saddr;
-		np->rcv_saddr = *saddr;
+		sk->sk_v6_rcv_saddr = *saddr;
 	}
 
 	/* set the source address */
@@ -963,7 +963,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		goto late_failure;
 
 	dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
-						      np->daddr.s6_addr32,
+						      sk->sk_v6_daddr.s6_addr32,
 						      inet->inet_sport,
 						      inet->inet_dport);
 	err = dccp_connect(sk);
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
index 6eef81fdbe56..6604fc3fe953 100644
--- a/net/dccp/ipv6.h
+++ b/net/dccp/ipv6.h
@@ -30,7 +30,6 @@ struct dccp6_request_sock {
 
 struct dccp6_timewait_sock {
 	struct inet_timewait_sock   inet;
-	struct inet6_timewait_sock  tw6;
 };
 
 #endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 662071b249cc..32e80d96d4c0 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -56,12 +56,9 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			const struct ipv6_pinfo *np = inet6_sk(sk);
-			struct inet6_timewait_sock *tw6;
 
-			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
-			tw6 = inet6_twsk((struct sock *)tw);
-			tw6->tw_v6_daddr = np->daddr;
-			tw6->tw_v6_rcv_saddr = np->rcv_saddr;
+			tw->tw_v6_daddr = sk->sk_v6_daddr;
+			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8e1e40653357..ecc179d676e4 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -121,13 +121,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
-		const struct ipv6_pinfo *np = inet6_sk(sk);
 
-		*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
-		*(struct in6_addr *)r->id.idiag_dst = np->daddr;
+		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
 
 		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
-			if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+			if (nla_put_u8(skb, INET_DIAG_TCLASS,
+				       inet6_sk(sk)->tclass) < 0)
 				goto errout;
 	}
 #endif
@@ -255,11 +255,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_inode	      = 0;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (tw->tw_family == AF_INET6) {
-		const struct inet6_timewait_sock *tw6 =
-						inet6_twsk((struct sock *)tw);
-
-		*(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr;
-		*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
+		*(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
+		*(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
 	}
 #endif
 
@@ -273,10 +270,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
-		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, r, portid, seq, nlmsg_flags,
-					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
+		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+					   nlmsg_flags, unlh);
+
+	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+				  nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -489,10 +487,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
 	entry.family = sk->sk_family;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (entry.family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
 
-		entry.saddr = np->rcv_saddr.s6_addr32;
-		entry.daddr = np->daddr.s6_addr32;
+		entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+		entry.daddr = sk->sk_v6_daddr.s6_addr32;
 	} else
 #endif
 	{
@@ -649,10 +646,8 @@ static int inet_twsk_diag_dump(struct sock *sk,
 		entry.family = tw->tw_family;
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == AF_INET6) {
-			struct inet6_timewait_sock *tw6 =
-						inet6_twsk((struct sock *)tw);
-			entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
-			entry.daddr = tw6->tw_v6_daddr.s6_addr32;
+			entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
+			entry.daddr = tw->tw_v6_daddr.s6_addr32;
 		} else
 #endif
 		{
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a62610443152..ccefc07beacd 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -202,15 +202,14 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 #if IS_ENABLED(CONFIG_IPV6)
 		} else if (skb->protocol == htons(ETH_P_IPV6) &&
 			   sk->sk_family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
 
 			pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
 				 (int) isk->inet_num,
-				 &inet6_sk(sk)->rcv_saddr,
+				 &sk->sk_v6_rcv_saddr,
 				 sk->sk_bound_dev_if);
 
-			if (!ipv6_addr_any(&np->rcv_saddr) &&
-			    !ipv6_addr_equal(&np->rcv_saddr,
+			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+			    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
 					     &ipv6_hdr(skb)->daddr))
 				continue;
 #endif
@@ -362,7 +361,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
 	} else if (saddr->sa_family == AF_INET6) {
 		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
 		struct ipv6_pinfo *np = inet6_sk(sk);
-		np->rcv_saddr = np->saddr = addr->sin6_addr;
+		sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
 #endif
 	}
 }
@@ -376,7 +375,7 @@ static void ping_clear_saddr(struct sock *sk, int dif)
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-		memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr));
+		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
 		memset(&np->saddr, 0, sizeof(np->saddr));
 #endif
 	}
@@ -418,7 +417,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	err = 0;
 	if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) ||
 	    (sk->sk_family == AF_INET6 &&
-	     !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)))
+	     !ipv6_addr_any(&sk->sk_v6_rcv_saddr)))
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
 
 	if (snum)
@@ -429,7 +428,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
-		memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr));
+		memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
 #endif
 
 	sk_dst_reset(sk);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 52f3c6b971d2..27535fd5ea10 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -240,7 +240,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 
 static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
 {
-	struct inet6_timewait_sock *tw6;
 	struct tcp_metrics_block *tm;
 	struct inetpeer_addr addr;
 	unsigned int hash;
@@ -253,9 +252,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 		hash = (__force unsigned int) addr.addr.a4;
 		break;
 	case AF_INET6:
-		tw6 = inet6_twsk((struct sock *)tw);
-		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
-		hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
+		*(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr;
+		hash = ipv6_addr_hash(&tw->tw_v6_daddr);
 		break;
 	default:
 		return NULL;
@@ -289,8 +287,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		hash = (__force unsigned int) addr.addr.a4;
 		break;
 	case AF_INET6:
-		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
-		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
+		*(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr;
+		hash = ipv6_addr_hash(&sk->sk_v6_daddr);
 		break;
 	default:
 		return NULL;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 58a3e69aef64..97b684159861 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -293,12 +293,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			struct inet6_timewait_sock *tw6;
 
-			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
-			tw6 = inet6_twsk((struct sock *)tw);
-			tw6->tw_v6_daddr = np->daddr;
-			tw6->tw_v6_rcv_saddr = np->rcv_saddr;
+			tw->tw_v6_daddr = sk->sk_v6_daddr;
+			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_tclass = np->tclass;
 			tw->tw_ipv6only = np->ipv6only;
 		}
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 611beab38a00..8b97d71e193b 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -101,22 +101,6 @@ static inline int tcp_probe_avail(void)
 		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\
 	} while (0)						\
 
-#if IS_ENABLED(CONFIG_IPV6)
-#define tcp_probe_copy_fl_to_si6(inet, si6, mem)		\
-	do {							\
-		struct ipv6_pinfo *pi6 = inet->pinet6;		\
-		si6.sin6_family = AF_INET6;			\
-		si6.sin6_port = inet->inet_##mem##port;		\
-		si6.sin6_addr = pi6->mem##addr;			\
-		si6.sin6_flowinfo = 0; /* No need here. */	\
-		si6.sin6_scope_id = 0;	/* No need here. */	\
-	} while (0)
-#else
-#define tcp_probe_copy_fl_to_si6(fl, si6, mem)			\
-	do {							\
-		memset(&si6, 0, sizeof(si6));			\
-	} while (0)
-#endif
 
 /*
  * Hook inserted to be called before each receive packet.
@@ -147,8 +131,17 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
 				break;
 			case AF_INET6:
-				tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
-				tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
+				memset(&p->src.v6, 0, sizeof(p->src.v6));
+				memset(&p->dst.v6, 0, sizeof(p->dst.v6));
+#if IS_ENABLED(CONFIG_IPV6)
+				p->src.v6.sin6_family = AF_INET6;
+				p->src.v6.sin6_port = inet->inet_sport;
+				p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
+
+				p->dst.v6.sin6_family = AF_INET6;
+				p->dst.v6.sin6_port = inet->inet_dport;
+				p->dst.v6.sin6_addr = sk->sk_v6_daddr;
+#endif
 				break;
 			default:
 				BUG();
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4b85e6f636c9..af07b5b23ebf 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -374,9 +374,8 @@ void tcp_retransmit_timer(struct sock *sk)
 		}
 #if IS_ENABLED(CONFIG_IPV6)
 		else if (sk->sk_family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
 			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
-				       &np->daddr,
+				       &sk->sk_v6_daddr,
 				       ntohs(inet->inet_dport), inet->inet_num,
 				       tp->snd_una, tp->snd_nxt);
 		}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4966b124dc2e..a2cb07cd3850 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -364,7 +364,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	inet->inet_rcv_saddr = v4addr;
 	inet->inet_saddr = v4addr;
 
-	np->rcv_saddr = addr->sin6_addr;
+	sk->sk_v6_rcv_saddr = addr->sin6_addr;
 
 	if (!(addr_type & IPV6_ADDR_MULTICAST))
 		np->saddr = addr->sin6_addr;
@@ -461,14 +461,14 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		    peer == 1)
 			return -ENOTCONN;
 		sin->sin6_port = inet->inet_dport;
-		sin->sin6_addr = np->daddr;
+		sin->sin6_addr = sk->sk_v6_daddr;
 		if (np->sndflow)
 			sin->sin6_flowinfo = np->flow_label;
 	} else {
-		if (ipv6_addr_any(&np->rcv_saddr))
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
 			sin->sin6_addr = np->saddr;
 		else
-			sin->sin6_addr = np->rcv_saddr;
+			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 
 		sin->sin6_port = inet->inet_sport;
 	}
@@ -655,7 +655,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_proto = sk->sk_protocol;
-		fl6.daddr = np->daddr;
+		fl6.daddr = sk->sk_v6_daddr;
 		fl6.saddr = np->saddr;
 		fl6.flowlabel = np->flow_label;
 		fl6.flowi6_oif = sk->sk_bound_dev_if;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 48b6bd2a9a14..a454b0ff57c7 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -107,16 +107,16 @@ ipv4_connected:
 		if (err)
 			goto out;
 
-		ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr);
+		ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr);
 
 		if (ipv6_addr_any(&np->saddr) ||
 		    ipv6_mapped_addr_any(&np->saddr))
 			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
 
-		if (ipv6_addr_any(&np->rcv_saddr) ||
-		    ipv6_mapped_addr_any(&np->rcv_saddr)) {
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+		    ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) {
 			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
-					       &np->rcv_saddr);
+					       &sk->sk_v6_rcv_saddr);
 			if (sk->sk_prot->rehash)
 				sk->sk_prot->rehash(sk);
 		}
@@ -145,7 +145,7 @@ ipv4_connected:
 		}
 	}
 
-	np->daddr = *daddr;
+	sk->sk_v6_daddr = *daddr;
 	np->flow_label = fl6.flowlabel;
 
 	inet->inet_dport = usin->sin6_port;
@@ -156,7 +156,7 @@ ipv4_connected:
 	 */
 
 	fl6.flowi6_proto = sk->sk_protocol;
-	fl6.daddr = np->daddr;
+	fl6.daddr = sk->sk_v6_daddr;
 	fl6.saddr = np->saddr;
 	fl6.flowi6_oif = sk->sk_bound_dev_if;
 	fl6.flowi6_mark = sk->sk_mark;
@@ -183,16 +183,16 @@ ipv4_connected:
 	if (ipv6_addr_any(&np->saddr))
 		np->saddr = fl6.saddr;
 
-	if (ipv6_addr_any(&np->rcv_saddr)) {
-		np->rcv_saddr = fl6.saddr;
+	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+		sk->sk_v6_rcv_saddr = fl6.saddr;
 		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 		if (sk->sk_prot->rehash)
 			sk->sk_prot->rehash(sk);
 	}
 
 	ip6_dst_store(sk, dst,
-		      ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
-		      &np->daddr : NULL,
+		      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+		      &sk->sk_v6_daddr : NULL,
 #ifdef CONFIG_IPV6_SUBTREES
 		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
 		      &np->saddr :
@@ -883,11 +883,10 @@ EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl);
 void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
 			     __u16 srcp, __u16 destp, int bucket)
 {
-	struct ipv6_pinfo *np = inet6_sk(sp);
 	const struct in6_addr *dest, *src;
 
-	dest  = &np->daddr;
-	src   = &np->rcv_saddr;
+	dest  = &sp->sk_v6_daddr;
+	src   = &sp->sk_v6_rcv_saddr;
 	seq_printf(seq,
 		   "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e4311cbc8b4e..b7400b480e74 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -165,11 +165,10 @@ EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
 
 void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
 
 	sin6->sin6_family = AF_INET6;
-	sin6->sin6_addr = np->daddr;
+	sin6->sin6_addr = sk->sk_v6_daddr;
 	sin6->sin6_port	= inet_sk(sk)->inet_dport;
 	/* We do not store received flowlabel for TCP */
 	sin6->sin6_flowinfo = 0;
@@ -203,7 +202,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
 
 	memset(fl6, 0, sizeof(*fl6));
 	fl6->flowi6_proto = sk->sk_protocol;
-	fl6->daddr = np->daddr;
+	fl6->daddr = sk->sk_v6_daddr;
 	fl6->saddr = np->saddr;
 	fl6->flowlabel = np->flow_label;
 	IP6_ECN_flow_xmit(sk, fl6->flowlabel);
@@ -245,7 +244,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	skb_dst_set_noref(skb, dst);
 
 	/* Restore final destination back after routing done */
-	fl6.daddr = np->daddr;
+	fl6.daddr = sk->sk_v6_daddr;
 
 	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 	rcu_read_unlock();
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 46440777e1c5..842d833dfc18 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -89,30 +89,16 @@ begin:
 	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (sk->sk_hash != hash)
 			continue;
-		if (sk->sk_state == TCP_TIME_WAIT) {
-			if (!INET6_TW_MATCH(sk, net, saddr, daddr, ports, dif))
-				continue;
-		} else {
-			if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
-				continue;
-		}
+		if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
+			continue;
 		if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
 			goto out;
 
-		if (sk->sk_state == TCP_TIME_WAIT) {
-			if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr,
-						     ports, dif))) {
-				sock_gen_put(sk);
-				goto begin;
-			}
-		} else {
-			if (unlikely(!INET6_MATCH(sk, net, saddr, daddr,
-						  ports, dif))) {
-				sock_put(sk);
-				goto begin;
-			}
-		goto found;
+		if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) {
+			sock_gen_put(sk);
+			goto begin;
 		}
+		goto found;
 	}
 	if (get_nulls_value(node) != slot)
 		goto begin;
@@ -133,11 +119,10 @@ static inline int compute_score(struct sock *sk, struct net *net,
 
 	if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
 	    sk->sk_family == PF_INET6) {
-		const struct ipv6_pinfo *np = inet6_sk(sk);
 
 		score = 1;
-		if (!ipv6_addr_any(&np->rcv_saddr)) {
-			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+		if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+			if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 				return -1;
 			score++;
 		}
@@ -229,9 +214,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *daddr = &np->rcv_saddr;
-	const struct in6_addr *saddr = &np->daddr;
+	const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
+	const struct in6_addr *saddr = &sk->sk_v6_daddr;
 	const int dif = sk->sk_bound_dev_if;
 	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
@@ -250,23 +234,19 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 		if (sk2->sk_hash != hash)
 			continue;
 
-		if (sk2->sk_state == TCP_TIME_WAIT) {
-			if (likely(INET6_TW_MATCH(sk2, net, saddr, daddr,
-						  ports, dif))) {
+		if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) {
+			if (sk2->sk_state == TCP_TIME_WAIT) {
 				tw = inet_twsk(sk2);
 				if (twsk_unique(sk, sk2, twp))
-					goto unique;
-				else
-					goto not_unique;
+					break;
 			}
-		}
-		if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif)))
 			goto not_unique;
+		}
 	}
 
-unique:
 	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
+	 * in hash table socket with a funny identity.
+	 */
 	inet->inet_num = lport;
 	inet->inet_sport = htons(lport);
 	sk->sk_hash = hash;
@@ -299,9 +279,9 @@ not_unique:
 static inline u32 inet6_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
-					  np->daddr.s6_addr32,
+
+	return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
+					  sk->sk_v6_daddr.s6_addr32,
 					  inet->inet_dport);
 }
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index d1e2e8ef29c5..4919a8e6063e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -174,7 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			}
 
 			if (ipv6_only_sock(sk) ||
-			    !ipv6_addr_v4mapped(&np->daddr)) {
+			    !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
 				retv = -EADDRNOTAVAIL;
 				break;
 			}
@@ -1011,7 +1011,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				struct in6_pktinfo src_info;
 				src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
 					np->sticky_pktinfo.ipi6_ifindex;
-				src_info.ipi6_addr = np->mcast_oif ? np->daddr : np->sticky_pktinfo.ipi6_addr;
+				src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr;
 				put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxhlim) {
@@ -1026,7 +1026,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				struct in6_pktinfo src_info;
 				src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
 					np->sticky_pktinfo.ipi6_ifindex;
-				src_info.ipi6_addr = np->mcast_oif ? np->daddr : np->sticky_pktinfo.ipi6_addr;
+				src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr :
+								     np->sticky_pktinfo.ipi6_addr;
 				put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxohlim) {
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index d6e4dd8b58df..54b75ead5a69 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -297,9 +297,9 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
 	struct nf_conn *ct;
 
-	tuple.src.u3.in6 = inet6->rcv_saddr;
+	tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
 	tuple.src.u.tcp.port = inet->inet_sport;
-	tuple.dst.u3.in6 = inet6->daddr;
+	tuple.dst.u3.in6 = sk->sk_v6_daddr;
 	tuple.dst.u.tcp.port = inet->inet_dport;
 	tuple.dst.protonum = sk->sk_protocol;
 
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 18f19df4189f..8815e31a87fe 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -116,7 +116,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else {
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
-		daddr = &np->daddr;
+		daddr = &sk->sk_v6_daddr;
 	}
 
 	if (!iif)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a4ed2416399e..3c00842b0079 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -77,20 +77,19 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 
 	sk_for_each_from(sk)
 		if (inet_sk(sk)->inet_num == num) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
 				continue;
 
-			if (!ipv6_addr_any(&np->daddr) &&
-			    !ipv6_addr_equal(&np->daddr, rmt_addr))
+			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
 				continue;
 
 			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
 				continue;
 
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+				if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
 					goto found;
 				if (is_multicast &&
 				    inet6_mc_check(sk, loc_addr, rmt_addr))
@@ -302,7 +301,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 
 	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
-	np->rcv_saddr = addr->sin6_addr;
+	sk->sk_v6_rcv_saddr = addr->sin6_addr;
 	if (!(addr_type & IPV6_ADDR_MULTICAST))
 		np->saddr = addr->sin6_addr;
 	err = 0;
@@ -804,8 +803,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		 * sk->sk_dst_cache.
 		 */
 		if (sk->sk_state == TCP_ESTABLISHED &&
-		    ipv6_addr_equal(daddr, &np->daddr))
-			daddr = &np->daddr;
+		    ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+			daddr = &sk->sk_v6_daddr;
 
 		if (addr_len >= sizeof(struct sockaddr_in6) &&
 		    sin6->sin6_scope_id &&
@@ -816,7 +815,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 			return -EDESTADDRREQ;
 
 		proto = inet->inet_num;
-		daddr = &np->daddr;
+		daddr = &sk->sk_v6_daddr;
 		fl6.flowlabel = np->flow_label;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 528e61afaf5e..541dfc40c7b3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -192,13 +192,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	}
 
 	if (tp->rx_opt.ts_recent_stamp &&
-	    !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) {
+	    !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
 		tp->rx_opt.ts_recent = 0;
 		tp->rx_opt.ts_recent_stamp = 0;
 		tp->write_seq = 0;
 	}
 
-	np->daddr = usin->sin6_addr;
+	sk->sk_v6_daddr = usin->sin6_addr;
 	np->flow_label = fl6.flowlabel;
 
 	/*
@@ -237,17 +237,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		} else {
 			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
 			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
-					       &np->rcv_saddr);
+					       &sk->sk_v6_rcv_saddr);
 		}
 
 		return err;
 	}
 
-	if (!ipv6_addr_any(&np->rcv_saddr))
-		saddr = &np->rcv_saddr;
+	if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+		saddr = &sk->sk_v6_rcv_saddr;
 
 	fl6.flowi6_proto = IPPROTO_TCP;
-	fl6.daddr = np->daddr;
+	fl6.daddr = sk->sk_v6_daddr;
 	fl6.saddr = saddr ? *saddr : np->saddr;
 	fl6.flowi6_oif = sk->sk_bound_dev_if;
 	fl6.flowi6_mark = sk->sk_mark;
@@ -266,7 +266,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	if (saddr == NULL) {
 		saddr = &fl6.saddr;
-		np->rcv_saddr = *saddr;
+		sk->sk_v6_rcv_saddr = *saddr;
 	}
 
 	/* set the source address */
@@ -279,7 +279,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	rt = (struct rt6_info *) dst;
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
-	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
+	    ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr))
 		tcp_fetch_timewait_stamp(sk, dst);
 
 	icsk->icsk_ext_hdr_len = 0;
@@ -298,7 +298,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
-							     np->daddr.s6_addr32,
+							     sk->sk_v6_daddr.s6_addr32,
 							     inet->inet_sport,
 							     inet->inet_dport);
 
@@ -515,7 +515,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
 static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
 						struct sock *addr_sk)
 {
-	return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr);
+	return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
 }
 
 static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk,
@@ -621,7 +621,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
 
 	if (sk) {
 		saddr = &inet6_sk(sk)->saddr;
-		daddr = &inet6_sk(sk)->daddr;
+		daddr = &sk->sk_v6_daddr;
 	} else if (req) {
 		saddr = &inet6_rsk(req)->loc_addr;
 		daddr = &inet6_rsk(req)->rmt_addr;
@@ -1116,11 +1116,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr);
 
 		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
 
-		newnp->rcv_saddr = newnp->saddr;
+		newsk->sk_v6_rcv_saddr = newnp->saddr;
 
 		inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
@@ -1185,9 +1185,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	newnp->daddr = treq->rmt_addr;
+	newsk->sk_v6_daddr = treq->rmt_addr;
 	newnp->saddr = treq->loc_addr;
-	newnp->rcv_saddr = treq->loc_addr;
+	newsk->sk_v6_rcv_saddr = treq->loc_addr;
 	newsk->sk_bound_dev_if = treq->iif;
 
 	/* Now IPv6 options...
@@ -1244,13 +1244,13 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
-	if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) {
+	if ((key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr)) != NULL) {
 		/* We're using one, so create a matching key
 		 * on the newsk structure. If we fail to get
 		 * memory, then we end up not copying the key
 		 * across. Shucks.
 		 */
-		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr,
+		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
 			       AF_INET6, key->key, key->keylen,
 			       sk_gfp_atomic(sk, GFP_ATOMIC));
 	}
@@ -1758,10 +1758,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	const struct inet_sock *inet = inet_sk(sp);
 	const struct tcp_sock *tp = tcp_sk(sp);
 	const struct inet_connection_sock *icsk = inet_csk(sp);
-	const struct ipv6_pinfo *np = inet6_sk(sp);
 
-	dest  = &np->daddr;
-	src   = &np->rcv_saddr;
+	dest  = &sp->sk_v6_daddr;
+	src   = &sp->sk_v6_rcv_saddr;
 	destp = ntohs(inet->inet_dport);
 	srcp  = ntohs(inet->inet_sport);
 
@@ -1810,11 +1809,10 @@ static void get_timewait6_sock(struct seq_file *seq,
 {
 	const struct in6_addr *dest, *src;
 	__u16 destp, srcp;
-	const struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
 
-	dest = &tw6->tw_v6_daddr;
-	src  = &tw6->tw_v6_rcv_saddr;
+	dest = &tw->tw_v6_daddr;
+	src  = &tw->tw_v6_rcv_saddr;
 	destp = ntohs(tw->tw_dport);
 	srcp  = ntohs(tw->tw_sport);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 37532478e3ba..b496de19a341 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -55,11 +55,10 @@
 
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
-	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
 	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
 	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(sk_rcv_saddr6);
+	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
 
 	/* if both are mapped, treat as IPv4 */
@@ -77,7 +76,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 		return 1;
 
 	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6))
+	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
 		return 1;
 
 	return 0;
@@ -105,7 +104,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 	unsigned int hash2_nulladdr =
 		udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
 	unsigned int hash2_partial =
-		udp6_portaddr_hash(sock_net(sk), &inet6_sk(sk)->rcv_saddr, 0);
+		udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -115,7 +114,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 static void udp_v6_rehash(struct sock *sk)
 {
 	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
-					  &inet6_sk(sk)->rcv_saddr,
+					  &sk->sk_v6_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 
 	udp_lib_rehash(sk, new_hash);
@@ -131,7 +130,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
 
 	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			sk->sk_family == PF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct inet_sock *inet = inet_sk(sk);
 
 		score = 0;
@@ -140,13 +138,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score++;
 		}
-		if (!ipv6_addr_any(&np->rcv_saddr)) {
-			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+		if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+			if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 				return -1;
 			score++;
 		}
-		if (!ipv6_addr_any(&np->daddr)) {
-			if (!ipv6_addr_equal(&np->daddr, saddr))
+		if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+			if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
 				return -1;
 			score++;
 		}
@@ -169,10 +167,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 
 	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			sk->sk_family == PF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct inet_sock *inet = inet_sk(sk);
 
-		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+		if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
 			return -1;
 		score = 0;
 		if (inet->inet_dport) {
@@ -180,8 +177,8 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 				return -1;
 			score++;
 		}
-		if (!ipv6_addr_any(&np->daddr)) {
-			if (!ipv6_addr_equal(&np->daddr, saddr))
+		if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+			if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
 				return -1;
 			score++;
 		}
@@ -549,7 +546,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
 
-	if (!ipv6_addr_any(&inet6_sk(sk)->daddr)) {
+	if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
 	}
@@ -690,20 +687,19 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 
 		if (udp_sk(s)->udp_port_hash == num &&
 		    s->sk_family == PF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(s);
 			if (inet->inet_dport) {
 				if (inet->inet_dport != rmt_port)
 					continue;
 			}
-			if (!ipv6_addr_any(&np->daddr) &&
-			    !ipv6_addr_equal(&np->daddr, rmt_addr))
+			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
 				continue;
 
 			if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)
 				continue;
 
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
 					continue;
 			}
 			if (!inet6_mc_check(s, loc_addr, rmt_addr))
@@ -1063,7 +1059,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	} else if (!up->pending) {
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
-		daddr = &np->daddr;
+		daddr = &sk->sk_v6_daddr;
 	} else
 		daddr = NULL;
 
@@ -1133,8 +1129,8 @@ do_udp_sendmsg:
 		 * sk->sk_dst_cache.
 		 */
 		if (sk->sk_state == TCP_ESTABLISHED &&
-		    ipv6_addr_equal(daddr, &np->daddr))
-			daddr = &np->daddr;
+		    ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+			daddr = &sk->sk_v6_daddr;
 
 		if (addr_len >= sizeof(struct sockaddr_in6) &&
 		    sin6->sin6_scope_id &&
@@ -1145,7 +1141,7 @@ do_udp_sendmsg:
 			return -EDESTADDRREQ;
 
 		fl6.fl6_dport = inet->inet_dport;
-		daddr = &np->daddr;
+		daddr = &sk->sk_v6_daddr;
 		fl6.flowlabel = np->flow_label;
 		connected = 1;
 	}
@@ -1261,8 +1257,8 @@ do_append_data:
 	if (dst) {
 		if (connected) {
 			ip6_dst_store(sk, dst,
-				      ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
-				      &np->daddr : NULL,
+				      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+				      &sk->sk_v6_daddr : NULL,
 #ifdef CONFIG_IPV6_SUBTREES
 				      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
 				      &np->saddr :
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index b076e8309bc2..9af77d9c0ec9 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1181,7 +1181,7 @@ static void l2tp_xmit_ipv6_csum(struct sock *sk, struct sk_buff *skb,
 	    !(skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
 		__wsum csum = skb_checksum(skb, 0, udp_len, 0);
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		uh->check = csum_ipv6_magic(&np->saddr, &np->daddr, udp_len,
+		uh->check = csum_ipv6_magic(&np->saddr, &sk->sk_v6_daddr, udp_len,
 					    IPPROTO_UDP, csum);
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
@@ -1189,7 +1189,7 @@ static void l2tp_xmit_ipv6_csum(struct sock *sk, struct sk_buff *skb,
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
-		uh->check = ~csum_ipv6_magic(&np->saddr, &np->daddr,
+		uh->check = ~csum_ipv6_magic(&np->saddr, &sk->sk_v6_daddr,
 					     udp_len, IPPROTO_UDP, 0);
 	}
 }
@@ -1713,13 +1713,13 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		if (ipv6_addr_v4mapped(&np->saddr) &&
-		    ipv6_addr_v4mapped(&np->daddr)) {
+		    ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			tunnel->v4mapped = true;
 			inet->inet_saddr = np->saddr.s6_addr32[3];
-			inet->inet_rcv_saddr = np->rcv_saddr.s6_addr32[3];
-			inet->inet_daddr = np->daddr.s6_addr32[3];
+			inet->inet_rcv_saddr = sk->sk_v6_rcv_saddr.s6_addr32[3];
+			inet->inet_daddr = sk->sk_v6_daddr.s6_addr32[3];
 		} else {
 			tunnel->v4mapped = false;
 		}
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 072d7202e182..2d6760a2ae34 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -127,9 +127,10 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
 
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tunnel->sock->sk_family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
+			const struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
+
 			seq_printf(m, " from %pI6c to %pI6c\n",
-				&np->saddr, &np->daddr);
+				&np->saddr, &tunnel->sock->sk_v6_daddr);
 		} else
 #endif
 		seq_printf(m, " from %pI4 to %pI4\n",
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index b8a6039314e8..cfd65304be60 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -63,7 +63,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
 	struct sock *sk;
 
 	sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
-		struct in6_addr *addr = inet6_rcv_saddr(sk);
+		const struct in6_addr *addr = inet6_rcv_saddr(sk);
 		struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
 		if (l2tp == NULL)
@@ -331,7 +331,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rcu_read_unlock();
 
 	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
-	np->rcv_saddr = addr->l2tp_addr;
+	sk->sk_v6_rcv_saddr = addr->l2tp_addr;
 	np->saddr = addr->l2tp_addr;
 
 	l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
@@ -421,14 +421,14 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
 		if (!lsk->peer_conn_id)
 			return -ENOTCONN;
 		lsa->l2tp_conn_id = lsk->peer_conn_id;
-		lsa->l2tp_addr = np->daddr;
+		lsa->l2tp_addr = sk->sk_v6_daddr;
 		if (np->sndflow)
 			lsa->l2tp_flowinfo = np->flow_label;
 	} else {
-		if (ipv6_addr_any(&np->rcv_saddr))
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
 			lsa->l2tp_addr = np->saddr;
 		else
-			lsa->l2tp_addr = np->rcv_saddr;
+			lsa->l2tp_addr = sk->sk_v6_rcv_saddr;
 
 		lsa->l2tp_conn_id = lsk->conn_id;
 	}
@@ -537,8 +537,8 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		 * sk->sk_dst_cache.
 		 */
 		if (sk->sk_state == TCP_ESTABLISHED &&
-		    ipv6_addr_equal(daddr, &np->daddr))
-			daddr = &np->daddr;
+		    ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+			daddr = &sk->sk_v6_daddr;
 
 		if (addr_len >= sizeof(struct sockaddr_in6) &&
 		    lsa->l2tp_scope_id &&
@@ -548,7 +548,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
 
-		daddr = &np->daddr;
+		daddr = &sk->sk_v6_daddr;
 		fl6.flowlabel = np->flow_label;
 	}
 
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 0825ff26e113..be446d517bc9 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -306,8 +306,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
 		if (np) {
 			if (nla_put(skb, L2TP_ATTR_IP6_SADDR, sizeof(np->saddr),
 				    &np->saddr) ||
-			    nla_put(skb, L2TP_ATTR_IP6_DADDR, sizeof(np->daddr),
-				    &np->daddr))
+			    nla_put(skb, L2TP_ATTR_IP6_DADDR, sizeof(sk->sk_v6_daddr),
+				    &sk->sk_v6_daddr))
 				goto nla_put_failure;
 		} else
 #endif
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 5ebee2ded9e9..f0a7adaef2ea 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -906,8 +906,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if ((tunnel->version == 2) &&
 		   (tunnel->sock->sk_family == AF_INET6)) {
-		struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
 		struct sockaddr_pppol2tpin6 sp;
+
 		len = sizeof(sp);
 		memset(&sp, 0, len);
 		sp.sa_family	= AF_PPPOX;
@@ -920,13 +920,13 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
 		sp.pppol2tp.d_session = session->peer_session_id;
 		sp.pppol2tp.addr.sin6_family = AF_INET6;
 		sp.pppol2tp.addr.sin6_port = inet->inet_dport;
-		memcpy(&sp.pppol2tp.addr.sin6_addr, &np->daddr,
-		       sizeof(np->daddr));
+		memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr,
+		       sizeof(tunnel->sock->sk_v6_daddr));
 		memcpy(uaddr, &sp, len);
 	} else if ((tunnel->version == 3) &&
 		   (tunnel->sock->sk_family == AF_INET6)) {
-		struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
 		struct sockaddr_pppol2tpv3in6 sp;
+
 		len = sizeof(sp);
 		memset(&sp, 0, len);
 		sp.sa_family	= AF_PPPOX;
@@ -939,8 +939,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
 		sp.pppol2tp.d_session = session->peer_session_id;
 		sp.pppol2tp.addr.sin6_family = AF_INET6;
 		sp.pppol2tp.addr.sin6_port = inet->inet_dport;
-		memcpy(&sp.pppol2tp.addr.sin6_addr, &np->daddr,
-		       sizeof(np->daddr));
+		memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr,
+		       sizeof(tunnel->sock->sk_v6_daddr));
 		memcpy(uaddr, &sp, len);
 #endif
 	} else if (tunnel->version == 3) {
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 5d8a3a3cd5a7..ef8a926752a9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -200,7 +200,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
 				     in->ifindex);
 		if (sk) {
 			int connected = (sk->sk_state == TCP_ESTABLISHED);
-			int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
+			int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
 
 			/* NOTE: we return listeners even if bound to
 			 * 0.0.0.0, those are filtered out in
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 06df2b9110f5..3dd0e374bc2b 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -370,7 +370,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 		 */
 		wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
 			    sk->sk_state != TCP_TIME_WAIT &&
-			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+			    ipv6_addr_any(&sk->sk_v6_rcv_saddr));
 
 		/* Ignore non-transparent sockets,
 		   if XT_SOCKET_TRANSPARENT is used */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e7b2d4fe2b6a..f6334aa19151 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -426,20 +426,20 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk)
 {
 	addr->v6.sin6_family = AF_INET6;
 	addr->v6.sin6_port = 0;
-	addr->v6.sin6_addr = inet6_sk(sk)->rcv_saddr;
+	addr->v6.sin6_addr = sk->sk_v6_rcv_saddr;
 }
 
 /* Initialize sk->sk_rcv_saddr from sctp_addr. */
 static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
 {
 	if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) {
-		inet6_sk(sk)->rcv_saddr.s6_addr32[0] = 0;
-		inet6_sk(sk)->rcv_saddr.s6_addr32[1] = 0;
-		inet6_sk(sk)->rcv_saddr.s6_addr32[2] = htonl(0x0000ffff);
-		inet6_sk(sk)->rcv_saddr.s6_addr32[3] =
+		sk->sk_v6_rcv_saddr.s6_addr32[0] = 0;
+		sk->sk_v6_rcv_saddr.s6_addr32[1] = 0;
+		sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff);
+		sk->sk_v6_rcv_saddr.s6_addr32[3] =
 			addr->v4.sin_addr.s_addr;
 	} else {
-		inet6_sk(sk)->rcv_saddr = addr->v6.sin6_addr;
+		sk->sk_v6_rcv_saddr = addr->v6.sin6_addr;
 	}
 }
 
@@ -447,12 +447,12 @@ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
 static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
 {
 	if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) {
-		inet6_sk(sk)->daddr.s6_addr32[0] = 0;
-		inet6_sk(sk)->daddr.s6_addr32[1] = 0;
-		inet6_sk(sk)->daddr.s6_addr32[2] = htonl(0x0000ffff);
-		inet6_sk(sk)->daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
+		sk->sk_v6_daddr.s6_addr32[0] = 0;
+		sk->sk_v6_daddr.s6_addr32[1] = 0;
+		sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff);
+		sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
 	} else {
-		inet6_sk(sk)->daddr = addr->v6.sin6_addr;
+		sk->sk_v6_daddr = addr->v6.sin6_addr;
 	}
 }
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9c9caaa5e0d3..0045c7cf1e9e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -294,7 +294,7 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
 	case PF_INET6:
 		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
 				proto_name,
-				&inet6_sk(sk)->rcv_saddr,
+				&sk->sk_v6_rcv_saddr,
 				inet_sk(sk)->inet_num);
 		break;
 	default:
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 8d8d97dbb389..80554fcf9fcc 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -304,12 +304,11 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			}
 			case AF_INET6: {
 				struct inet_sock *inet = inet_sk(sk);
-				struct ipv6_pinfo *inet6 = inet6_sk(sk);
 
-				print_ipv6_addr(ab, &inet6->rcv_saddr,
+				print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr,
 						inet->inet_sport,
 						"laddr", "lport");
-				print_ipv6_addr(ab, &inet6->daddr,
+				print_ipv6_addr(ab, &sk->sk_v6_daddr,
 						inet->inet_dport,
 						"faddr", "fport");
 				break;
-- 
cgit v1.2.3


From f69b923a758f598fd6bb69e57564b59506f4f1fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Oct 2013 21:47:29 -0700
Subject: udp: fix a typo in __udp4_lib_mcast_demux_lookup

At this point sk might contain garbage.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4226c53daaed..9f27bb800607 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1847,7 +1847,7 @@ begin:
 		if (count != 1 ||
 		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
 			result = NULL;
-		else if (unlikely(!__udp_is_mcast_sock(net, sk,
+		else if (unlikely(!__udp_is_mcast_sock(net, result,
 						       loc_port, loc_addr,
 						       rmt_port, rmt_addr,
 						       dif, hnum))) {
-- 
cgit v1.2.3


From c2bb06db59eaf92eb5ca9c6faed590597c6ceccb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Oct 2013 03:05:48 -0700
Subject: net: fix build errors if ipv6 is disabled

CONFIG_IPV6=n is still a valid choice ;)

It appears we can remove dead code.

Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_checksum.h | 2 ++
 net/ipv4/ping.c            | 8 +++++---
 net/ipv4/tcp_metrics.c     | 4 ++++
 net/sunrpc/svcsock.c       | 2 ++
 security/lsm_audit.c       | 2 ++
 5 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h
index 1944406949ba..9e3c540c1b11 100644
--- a/include/net/ip6_checksum.h
+++ b/include/net/ip6_checksum.h
@@ -66,12 +66,14 @@ static inline void __tcp_v6_send_check(struct sk_buff *skb,
 	}
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
 static inline void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 
 	__tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr);
 }
+#endif
 
 int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
 #endif
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index ccefc07beacd..9afbdb19f4a2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -415,10 +415,12 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		 (int)sk->sk_bound_dev_if);
 
 	err = 0;
-	if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) ||
-	    (sk->sk_family == AF_INET6 &&
-	     !ipv6_addr_any(&sk->sk_v6_rcv_saddr)))
+	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
 
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 27535fd5ea10..8fcc2cb9dba4 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -251,10 +251,12 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 		addr.addr.a4 = tw->tw_daddr;
 		hash = (__force unsigned int) addr.addr.a4;
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr;
 		hash = ipv6_addr_hash(&tw->tw_v6_daddr);
 		break;
+#endif
 	default:
 		return NULL;
 	}
@@ -286,10 +288,12 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		addr.addr.a4 = inet_sk(sk)->inet_daddr;
 		hash = (__force unsigned int) addr.addr.a4;
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr;
 		hash = ipv6_addr_hash(&sk->sk_v6_daddr);
 		break;
+#endif
 	default:
 		return NULL;
 	}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0045c7cf1e9e..b6e59f0a9475 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -291,12 +291,14 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
 				&inet_sk(sk)->inet_rcv_saddr,
 				inet_sk(sk)->inet_num);
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
 	case PF_INET6:
 		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
 				proto_name,
 				&sk->sk_v6_rcv_saddr,
 				inet_sk(sk)->inet_num);
 		break;
+#endif
 	default:
 		len = snprintf(buf, remaining, "*unknown-%d*\n",
 				sk->sk_family);
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 80554fcf9fcc..234bc2ab450c 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -302,6 +302,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 						"faddr", "fport");
 				break;
 			}
+#if IS_ENABLED(CONFIG_IPV6)
 			case AF_INET6: {
 				struct inet_sock *inet = inet_sk(sk);
 
@@ -313,6 +314,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 						"faddr", "fport");
 				break;
 			}
+#endif
 			case AF_UNIX:
 				u = unix_sk(sk);
 				if (u->path.dentry) {
-- 
cgit v1.2.3


From 4c60f1d67fae632743df9324301e3cb2682f54d4 Mon Sep 17 00:00:00 2001
From: "baker.zhang" <baker.kernel@gmail.com>
Date: Tue, 8 Oct 2013 11:36:51 +0800
Subject: fib_trie: only calc for the un-first node

This is a enhancement.

for the first node in fib_trie, newpos is 0, bit is 1.
Only for the leaf or node with unmatched key need calc pos.

Signed-off-by: baker.zhang <baker.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 45c74ba03970..ec9a9ef4ce50 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1117,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		 *  first tnode need some special handling
 		 */
 
-		if (tp)
-			pos = tp->pos+tp->bits;
-		else
-			pos = 0;
-
 		if (n) {
+			pos = tp ? tp->pos+tp->bits : 0;
 			newpos = tkey_mismatch(key, pos, n->key);
 			tn = tnode_new(n->key, newpos, 1);
 		} else {
-- 
cgit v1.2.3


From 634fb979e8f3a70f04c1f2f519d0cd1142eb5c1a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Oct 2013 15:21:29 -0700
Subject: inet: includes a sock_common in request_sock

TCP listener refactoring, part 5 :

We want to be able to insert request sockets (SYN_RECV) into main
ehash table instead of the per listener hash table to allow RCU
lookups and remove listener lock contention.

This patch includes the needed struct sock_common in front
of struct request_sock

This means there is no more inet6_request_sock IPv6 specific
structure.

Following inet_request_sock fields were renamed as they became
macros to reference fields from struct sock_common.
Prefix ir_ was chosen to avoid name collisions.

loc_port   -> ir_loc_port
loc_addr   -> ir_loc_addr
rmt_addr   -> ir_rmt_addr
rmt_port   -> ir_rmt_port
iif        -> ir_iif

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             | 26 ++---------------
 include/net/inet_sock.h          | 16 +++++-----
 include/net/request_sock.h       |  1 +
 include/net/tcp.h                |  4 +--
 net/dccp/ipv4.c                  | 18 ++++++------
 net/dccp/ipv6.c                  | 63 ++++++++++++++++++++--------------------
 net/dccp/ipv6.h                  |  1 -
 net/dccp/minisocks.c             |  4 +--
 net/dccp/output.c                |  4 +--
 net/ipv4/inet_connection_sock.c  | 23 ++++++++-------
 net/ipv4/inet_diag.c             | 22 +++++++-------
 net/ipv4/syncookies.c            | 12 ++++----
 net/ipv4/tcp_ipv4.c              | 38 ++++++++++++------------
 net/ipv4/tcp_metrics.c           |  8 +++--
 net/ipv4/tcp_output.c            |  4 +--
 net/ipv6/inet6_connection_sock.c | 26 ++++++++---------
 net/ipv6/syncookies.c            | 24 +++++++--------
 net/ipv6/tcp_ipv6.c              | 61 +++++++++++++++++++-------------------
 net/netlabel/netlabel_kapi.c     |  2 +-
 19 files changed, 169 insertions(+), 188 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 35f6c1b562c4..a80a63cfb70c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -115,16 +115,8 @@ static inline int inet6_iif(const struct sk_buff *skb)
 	return IP6CB(skb)->iif;
 }
 
-struct inet6_request_sock {
-	struct in6_addr		loc_addr;
-	struct in6_addr		rmt_addr;
-	struct sk_buff		*pktopts;
-	int			iif;
-};
-
 struct tcp6_request_sock {
 	struct tcp_request_sock	  tcp6rsk_tcp;
-	struct inet6_request_sock tcp6rsk_inet6;
 };
 
 struct ipv6_mc_socklist;
@@ -264,26 +256,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return inet_sk(__sk)->pinet6;
 }
 
-static inline struct inet6_request_sock *
-			inet6_rsk(const struct request_sock *rsk)
-{
-	return (struct inet6_request_sock *)(((u8 *)rsk) +
-					     inet_rsk(rsk)->inet6_rsk_offset);
-}
-
-static inline u32 inet6_rsk_offset(struct request_sock *rsk)
-{
-	return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock);
-}
-
 static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
 {
 	struct request_sock *req = reqsk_alloc(ops);
 
-	if (req != NULL) {
-		inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req);
-		inet6_rsk(req)->pktopts = NULL;
-	}
+	if (req)
+		inet_rsk(req)->pktopts = NULL;
 
 	return req;
 }
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 6d9a7e6eb5a4..f91204442efa 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -70,13 +70,14 @@ struct ip_options_data {
 
 struct inet_request_sock {
 	struct request_sock	req;
-#if IS_ENABLED(CONFIG_IPV6)
-	u16			inet6_rsk_offset;
-#endif
-	__be16			loc_port;
-	__be32			loc_addr;
-	__be32			rmt_addr;
-	__be16			rmt_port;
+#define ir_loc_addr		req.__req_common.skc_rcv_saddr
+#define ir_rmt_addr		req.__req_common.skc_daddr
+#define ir_loc_port		req.__req_common.skc_num
+#define ir_rmt_port		req.__req_common.skc_dport
+#define ir_v6_rmt_addr		req.__req_common.skc_v6_daddr
+#define ir_v6_loc_addr		req.__req_common.skc_v6_rcv_saddr
+#define ir_iif			req.__req_common.skc_bound_dev_if
+
 	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
 				rcv_wscale : 4,
@@ -88,6 +89,7 @@ struct inet_request_sock {
 				no_srccheck: 1;
 	kmemcheck_bitfield_end(flags);
 	struct ip_options_rcu	*opt;
+	struct sk_buff		*pktopts;
 };
 
 static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 65c3e5164a5c..7f830ff67f08 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -48,6 +48,7 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req);
 /* struct request_sock - mini sock to represent a connection request
  */
 struct request_sock {
+	struct sock_common		__req_common;
 	struct request_sock		*dl_next;
 	u16				mss;
 	u8				num_retrans; /* number of retransmits */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 39bbfa1602b2..24a06161d174 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1109,8 +1109,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->wscale_ok = rx_opt->wscale_ok;
 	ireq->acked = 0;
 	ireq->ecn_ok = 0;
-	ireq->rmt_port = tcp_hdr(skb)->source;
-	ireq->loc_port = tcp_hdr(skb)->dest;
+	ireq->ir_rmt_port = tcp_hdr(skb)->source;
+	ireq->ir_loc_port = tcp_hdr(skb)->dest;
 }
 
 void tcp_enter_memory_pressure(struct sock *sk);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ebc54fef85a5..720c36225ed9 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -409,9 +409,9 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newinet		   = inet_sk(newsk);
 	ireq		   = inet_rsk(req);
-	newinet->inet_daddr	= ireq->rmt_addr;
-	newinet->inet_rcv_saddr = ireq->loc_addr;
-	newinet->inet_saddr	= ireq->loc_addr;
+	newinet->inet_daddr	= ireq->ir_rmt_addr;
+	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+	newinet->inet_saddr	= ireq->ir_loc_addr;
 	newinet->inet_opt	= ireq->opt;
 	ireq->opt	   = NULL;
 	newinet->mc_index  = inet_iif(skb);
@@ -516,10 +516,10 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
 		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct dccp_hdr *dh = dccp_hdr(skb);
 
-		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
-							      ireq->rmt_addr);
-		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
-					    ireq->rmt_addr,
+		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
+							      ireq->ir_rmt_addr);
+		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+					    ireq->ir_rmt_addr,
 					    ireq->opt);
 		err = net_xmit_eval(err);
 	}
@@ -641,8 +641,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;
 
 	ireq = inet_rsk(req);
-	ireq->loc_addr = ip_hdr(skb)->daddr;
-	ireq->rmt_addr = ip_hdr(skb)->saddr;
+	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
 
 	/*
 	 * Step 3: Process LISTEN state
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 7f075b83128a..5cc5b24a956e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -216,7 +216,7 @@ out:
 
 static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 {
-	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff *skb;
 	struct in6_addr *final_p, final;
@@ -226,12 +226,12 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_proto = IPPROTO_DCCP;
-	fl6.daddr = ireq6->rmt_addr;
-	fl6.saddr = ireq6->loc_addr;
+	fl6.daddr = ireq->ir_v6_rmt_addr;
+	fl6.saddr = ireq->ir_v6_loc_addr;
 	fl6.flowlabel = 0;
-	fl6.flowi6_oif = ireq6->iif;
-	fl6.fl6_dport = inet_rsk(req)->rmt_port;
-	fl6.fl6_sport = inet_rsk(req)->loc_port;
+	fl6.flowi6_oif = ireq->ir_iif;
+	fl6.fl6_dport = ireq->ir_rmt_port;
+	fl6.fl6_sport = ireq->ir_loc_port;
 	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
 
 
@@ -249,9 +249,9 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 		struct dccp_hdr *dh = dccp_hdr(skb);
 
 		dh->dccph_checksum = dccp_v6_csum_finish(skb,
-							 &ireq6->loc_addr,
-							 &ireq6->rmt_addr);
-		fl6.daddr = ireq6->rmt_addr;
+							 &ireq->ir_v6_loc_addr,
+							 &ireq->ir_v6_rmt_addr);
+		fl6.daddr = ireq->ir_v6_rmt_addr;
 		err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 		err = net_xmit_eval(err);
 	}
@@ -264,8 +264,7 @@ done:
 static void dccp_v6_reqsk_destructor(struct request_sock *req)
 {
 	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
-	if (inet6_rsk(req)->pktopts != NULL)
-		kfree_skb(inet6_rsk(req)->pktopts);
+	kfree_skb(inet_rsk(req)->pktopts);
 }
 
 static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
@@ -359,7 +358,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct request_sock *req;
 	struct dccp_request_sock *dreq;
-	struct inet6_request_sock *ireq6;
+	struct inet_request_sock *ireq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
 	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -398,22 +397,22 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	ireq6 = inet6_rsk(req);
-	ireq6->rmt_addr = ipv6_hdr(skb)->saddr;
-	ireq6->loc_addr = ipv6_hdr(skb)->daddr;
+	ireq = inet_rsk(req);
+	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
 
 	if (ipv6_opt_accepted(sk, skb) ||
 	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
 		atomic_inc(&skb->users);
-		ireq6->pktopts = skb;
+		ireq->pktopts = skb;
 	}
-	ireq6->iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = sk->sk_bound_dev_if;
 
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		ireq6->iif = inet6_iif(skb);
+	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq->ir_iif = inet6_iif(skb);
 
 	/*
 	 * Step 3: Process LISTEN state
@@ -446,7 +445,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 					      struct request_sock *req,
 					      struct dst_entry *dst)
 {
-	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct inet_sock *newinet;
 	struct dccp6_sock *newdp6;
@@ -505,12 +504,12 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_proto = IPPROTO_DCCP;
-		fl6.daddr = ireq6->rmt_addr;
+		fl6.daddr = ireq->ir_v6_rmt_addr;
 		final_p = fl6_update_dst(&fl6, np->opt, &final);
-		fl6.saddr = ireq6->loc_addr;
+		fl6.saddr = ireq->ir_v6_loc_addr;
 		fl6.flowi6_oif = sk->sk_bound_dev_if;
-		fl6.fl6_dport = inet_rsk(req)->rmt_port;
-		fl6.fl6_sport = inet_rsk(req)->loc_port;
+		fl6.fl6_dport = ireq->ir_rmt_port;
+		fl6.fl6_sport = ireq->ir_loc_port;
 		security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
 		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
@@ -538,10 +537,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	newsk->sk_v6_daddr = ireq6->rmt_addr;
-	newnp->saddr = ireq6->loc_addr;
-	newsk->sk_v6_rcv_saddr = ireq6->loc_addr;
-	newsk->sk_bound_dev_if = ireq6->iif;
+	newsk->sk_v6_daddr	= ireq->ir_v6_rmt_addr;
+	newnp->saddr		= ireq->ir_v6_loc_addr;
+	newsk->sk_v6_rcv_saddr	= ireq->ir_v6_loc_addr;
+	newsk->sk_bound_dev_if	= ireq->ir_iif;
 
 	/* Now IPv6 options...
 
@@ -554,10 +553,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	/* Clone pktoptions received with SYN */
 	newnp->pktoptions = NULL;
-	if (ireq6->pktopts != NULL) {
-		newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
-		consume_skb(ireq6->pktopts);
-		ireq6->pktopts = NULL;
+	if (ireq->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+		consume_skb(ireq->pktopts);
+		ireq->pktopts = NULL;
 		if (newnp->pktoptions)
 			skb_set_owner_r(newnp->pktoptions, newsk);
 	}
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
index 6604fc3fe953..af259e15e7f0 100644
--- a/net/dccp/ipv6.h
+++ b/net/dccp/ipv6.h
@@ -25,7 +25,6 @@ struct dccp6_sock {
 
 struct dccp6_request_sock {
 	struct dccp_request_sock  dccp;
-	struct inet6_request_sock inet6;
 };
 
 struct dccp6_timewait_sock {
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 32e80d96d4c0..66afbcec2941 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -266,8 +266,8 @@ int dccp_reqsk_init(struct request_sock *req,
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
-	inet_rsk(req)->rmt_port	  = dccp_hdr(skb)->dccph_sport;
-	inet_rsk(req)->loc_port	  = dccp_hdr(skb)->dccph_dport;
+	inet_rsk(req)->ir_rmt_port	  = dccp_hdr(skb)->dccph_sport;
+	inet_rsk(req)->ir_loc_port	  = dccp_hdr(skb)->dccph_dport;
 	inet_rsk(req)->acked	  = 0;
 	dreq->dreq_timestamp_echo = 0;
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index d17fc90a74b6..9bf195d1b87a 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -424,8 +424,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	/* Build and checksum header */
 	dh = dccp_zeroed_hdr(skb, dccp_header_size);
 
-	dh->dccph_sport	= inet_rsk(req)->loc_port;
-	dh->dccph_dport	= inet_rsk(req)->rmt_port;
+	dh->dccph_sport	= inet_rsk(req)->ir_loc_port;
+	dh->dccph_dport	= inet_rsk(req)->ir_rmt_port;
 	dh->dccph_doff	= (dccp_header_size +
 			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
 	dh->dccph_type	= DCCP_PKT_RESPONSE;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 56e82a4027b4..2ffd931d652f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -412,8 +412,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol,
 			   flags,
-			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
-			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
@@ -448,8 +448,8 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
-			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
-			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
@@ -495,9 +495,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
 	     prev = &req->dl_next) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
 
-		if (ireq->rmt_port == rport &&
-		    ireq->rmt_addr == raddr &&
-		    ireq->loc_addr == laddr &&
+		if (ireq->ir_rmt_port == rport &&
+		    ireq->ir_rmt_addr == raddr &&
+		    ireq->ir_loc_addr == laddr &&
 		    AF_INET_FAMILY(req->rsk_ops->family)) {
 			WARN_ON(req->sk);
 			*prevp = prev;
@@ -514,7 +514,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+	const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+				     inet_rsk(req)->ir_rmt_port,
 				     lopt->hash_rnd, lopt->nr_table_entries);
 
 	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
@@ -674,9 +675,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
-		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
-		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
-		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
+		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->ir_loc_port);
+		inet_sk(newsk)->inet_sport = inet_rsk(req)->ir_loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ecc179d676e4..41e1c3ea8b51 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -679,12 +679,12 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6) {
 		if (req->rsk_ops->family == AF_INET6) {
-			entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
-			entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
+			entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
+			entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
 		} else if (req->rsk_ops->family == AF_INET) {
-			ipv6_addr_set_v4mapped(ireq->loc_addr,
+			ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
 					       &entry->saddr_storage);
-			ipv6_addr_set_v4mapped(ireq->rmt_addr,
+			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
 					       &entry->daddr_storage);
 			entry->saddr = entry->saddr_storage.s6_addr32;
 			entry->daddr = entry->daddr_storage.s6_addr32;
@@ -692,8 +692,8 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
 	} else
 #endif
 	{
-		entry->saddr = &ireq->loc_addr;
-		entry->daddr = &ireq->rmt_addr;
+		entry->saddr = &ireq->ir_loc_addr;
+		entry->daddr = &ireq->ir_rmt_addr;
 	}
 }
 
@@ -728,9 +728,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 		tmo = 0;
 
 	r->id.idiag_sport = inet->inet_sport;
-	r->id.idiag_dport = ireq->rmt_port;
-	r->id.idiag_src[0] = ireq->loc_addr;
-	r->id.idiag_dst[0] = ireq->rmt_addr;
+	r->id.idiag_dport = ireq->ir_rmt_port;
+	r->id.idiag_src[0] = ireq->ir_loc_addr;
+	r->id.idiag_dst[0] = ireq->ir_rmt_addr;
 	r->idiag_expires = jiffies_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
@@ -789,13 +789,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 			if (reqnum < s_reqnum)
 				continue;
-			if (r->id.idiag_dport != ireq->rmt_port &&
+			if (r->id.idiag_dport != ireq->ir_rmt_port &&
 			    r->id.idiag_dport)
 				continue;
 
 			if (bc) {
 				inet_diag_req_addrs(sk, req, &entry);
-				entry.dport = ntohs(ireq->rmt_port);
+				entry.dport = ntohs(ireq->ir_rmt_port);
 
 				if (!inet_diag_bc_run(bc, &entry))
 					continue;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 15e024105f91..984e21cf3c50 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -304,10 +304,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
 	req->mss		= mss;
-	ireq->loc_port		= th->dest;
-	ireq->rmt_port		= th->source;
-	ireq->loc_addr		= ip_hdr(skb)->daddr;
-	ireq->rmt_addr		= ip_hdr(skb)->saddr;
+	ireq->ir_loc_port		= th->dest;
+	ireq->ir_rmt_port		= th->source;
+	ireq->ir_loc_addr		= ip_hdr(skb)->daddr;
+	ireq->ir_rmt_addr		= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= ecn_ok;
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
@@ -347,8 +347,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
-			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
-			   ireq->loc_addr, th->source, th->dest);
+			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, th->source, th->dest);
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
 	if (IS_ERR(rt)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e4695dde1af6..114d1b748cbb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -835,11 +835,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	skb = tcp_make_synack(sk, dst, req, NULL);
 
 	if (skb) {
-		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
 		skb_set_queue_mapping(skb, queue_mapping);
-		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
-					    ireq->rmt_addr,
+		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+					    ireq->ir_rmt_addr,
 					    ireq->opt);
 		err = net_xmit_eval(err);
 		if (!tcp_rsk(req)->snt_synack && !err)
@@ -972,7 +972,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 {
 	union tcp_md5_addr *addr;
 
-	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
+	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
 	return tcp_md5_do_lookup(sk, addr, AF_INET);
 }
 
@@ -1149,8 +1149,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
 		saddr = inet_sk(sk)->inet_saddr;
 		daddr = inet_sk(sk)->inet_daddr;
 	} else if (req) {
-		saddr = inet_rsk(req)->loc_addr;
-		daddr = inet_rsk(req)->rmt_addr;
+		saddr = inet_rsk(req)->ir_loc_addr;
+		daddr = inet_rsk(req)->ir_rmt_addr;
 	} else {
 		const struct iphdr *iph = ip_hdr(skb);
 		saddr = iph->saddr;
@@ -1366,8 +1366,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 		kfree_skb(skb_synack);
 		return -1;
 	}
-	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
-				    ireq->rmt_addr, ireq->opt);
+	err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
+				    ireq->ir_rmt_addr, ireq->opt);
 	err = net_xmit_eval(err);
 	if (!err)
 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
@@ -1502,8 +1502,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_openreq_init(req, &tmp_opt, skb);
 
 	ireq = inet_rsk(req);
-	ireq->loc_addr = daddr;
-	ireq->rmt_addr = saddr;
+	ireq->ir_loc_addr = daddr;
+	ireq->ir_rmt_addr = saddr;
 	ireq->no_srccheck = inet_sk(sk)->transparent;
 	ireq->opt = tcp_v4_save_options(skb);
 
@@ -1578,15 +1578,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
 
 	if (skb_synack) {
-		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
+		__tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
 	} else
 		goto drop_and_free;
 
 	if (likely(!do_fastopen)) {
 		int err;
-		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
-		     ireq->rmt_addr, ireq->opt);
+		err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
+		     ireq->ir_rmt_addr, ireq->opt);
 		err = net_xmit_eval(err);
 		if (err || want_cookie)
 			goto drop_and_free;
@@ -1644,9 +1644,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
 	ireq		      = inet_rsk(req);
-	newinet->inet_daddr   = ireq->rmt_addr;
-	newinet->inet_rcv_saddr = ireq->loc_addr;
-	newinet->inet_saddr	      = ireq->loc_addr;
+	newinet->inet_daddr   = ireq->ir_rmt_addr;
+	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+	newinet->inet_saddr	      = ireq->ir_loc_addr;
 	inet_opt	      = ireq->opt;
 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
 	ireq->opt	      = NULL;
@@ -2548,10 +2548,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
 		i,
-		ireq->loc_addr,
+		ireq->ir_loc_addr,
 		ntohs(inet_sk(sk)->inet_sport),
-		ireq->rmt_addr,
-		ntohs(ireq->rmt_port),
+		ireq->ir_rmt_addr,
+		ntohs(ireq->ir_rmt_port),
 		TCP_SYN_RECV,
 		0, 0, /* could print option size, but that is af dependent. */
 		1,    /* timers active (only the expire timer) */
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 8fcc2cb9dba4..4a2a84110dfb 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -215,13 +215,15 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 	addr.family = req->rsk_ops->family;
 	switch (addr.family) {
 	case AF_INET:
-		addr.addr.a4 = inet_rsk(req)->rmt_addr;
+		addr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
 		hash = (__force unsigned int) addr.addr.a4;
 		break;
+#if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
-		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
+		*(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
+		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
 		break;
+#endif
 	default:
 		return NULL;
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c6f01f2cdb32..faec81353522 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2734,8 +2734,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->syn = 1;
 	th->ack = 1;
 	TCP_ECN_make_synack(req, th);
-	th->source = ireq->loc_port;
-	th->dest = ireq->rmt_port;
+	th->source = ireq->ir_loc_port;
+	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
 	 * not even correctly set)
 	 */
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b7400b480e74..1317c569b58f 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -70,20 +70,20 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
 				      struct flowi6 *fl6,
 				      const struct request_sock *req)
 {
-	struct inet6_request_sock *treq = inet6_rsk(req);
+	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_addr *final_p, final;
 	struct dst_entry *dst;
 
 	memset(fl6, 0, sizeof(*fl6));
 	fl6->flowi6_proto = IPPROTO_TCP;
-	fl6->daddr = treq->rmt_addr;
+	fl6->daddr = ireq->ir_v6_rmt_addr;
 	final_p = fl6_update_dst(fl6, np->opt, &final);
-	fl6->saddr = treq->loc_addr;
-	fl6->flowi6_oif = treq->iif;
+	fl6->saddr = ireq->ir_v6_loc_addr;
+	fl6->flowi6_oif = ireq->ir_iif;
 	fl6->flowi6_mark = sk->sk_mark;
-	fl6->fl6_dport = inet_rsk(req)->rmt_port;
-	fl6->fl6_sport = inet_rsk(req)->loc_port;
+	fl6->fl6_dport = ireq->ir_rmt_port;
+	fl6->fl6_sport = ireq->ir_loc_port;
 	security_req_classify_flow(req, flowi6_to_flowi(fl6));
 
 	dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
@@ -129,13 +129,13 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk,
 						     lopt->nr_table_entries)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		const struct inet6_request_sock *treq = inet6_rsk(req);
+		const struct inet_request_sock *ireq = inet_rsk(req);
 
-		if (inet_rsk(req)->rmt_port == rport &&
+		if (ireq->ir_rmt_port == rport &&
 		    req->rsk_ops->family == AF_INET6 &&
-		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
-		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
-		    (!treq->iif || treq->iif == iif)) {
+		    ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) &&
+		    ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) &&
+		    (!ireq->ir_iif || ireq->ir_iif == iif)) {
 			WARN_ON(req->sk != NULL);
 			*prevp = prev;
 			return req;
@@ -153,8 +153,8 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
-				      inet_rsk(req)->rmt_port,
+	const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
+				      inet_rsk(req)->ir_rmt_port,
 				      lopt->hash_rnd, lopt->nr_table_entries);
 
 	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index d703218a653b..bc5698f9e4cd 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -150,7 +150,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_options_received tcp_opt;
 	struct inet_request_sock *ireq;
-	struct inet6_request_sock *ireq6;
 	struct tcp_request_sock *treq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -187,7 +186,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	ireq = inet_rsk(req);
-	ireq6 = inet6_rsk(req);
 	treq = tcp_rsk(req);
 	treq->listener = NULL;
 
@@ -195,22 +193,22 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		goto out_free;
 
 	req->mss = mss;
-	ireq->rmt_port = th->source;
-	ireq->loc_port = th->dest;
-	ireq6->rmt_addr = ipv6_hdr(skb)->saddr;
-	ireq6->loc_addr = ipv6_hdr(skb)->daddr;
+	ireq->ir_rmt_port = th->source;
+	ireq->ir_loc_port = th->dest;
+	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
 	if (ipv6_opt_accepted(sk, skb) ||
 	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
 		atomic_inc(&skb->users);
-		ireq6->pktopts = skb;
+		ireq->pktopts = skb;
 	}
 
-	ireq6->iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = sk->sk_bound_dev_if;
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		ireq6->iif = inet6_iif(skb);
+	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq->ir_iif = inet6_iif(skb);
 
 	req->expires = 0UL;
 	req->num_retrans = 0;
@@ -234,12 +232,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		struct flowi6 fl6;
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_proto = IPPROTO_TCP;
-		fl6.daddr = ireq6->rmt_addr;
+		fl6.daddr = ireq->ir_v6_rmt_addr;
 		final_p = fl6_update_dst(&fl6, np->opt, &final);
-		fl6.saddr = ireq6->loc_addr;
+		fl6.saddr = ireq->ir_v6_loc_addr;
 		fl6.flowi6_oif = sk->sk_bound_dev_if;
 		fl6.flowi6_mark = sk->sk_mark;
-		fl6.fl6_dport = inet_rsk(req)->rmt_port;
+		fl6.fl6_dport = ireq->ir_rmt_port;
 		fl6.fl6_sport = inet_sk(sk)->inet_sport;
 		security_req_classify_flow(req, flowi6_to_flowi(&fl6));
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 541dfc40c7b3..db234d609b33 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -465,7 +465,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
 			      u16 queue_mapping)
 {
-	struct inet6_request_sock *treq = inet6_rsk(req);
+	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	int err = -ENOMEM;
@@ -477,9 +477,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
 	skb = tcp_make_synack(sk, dst, req, NULL);
 
 	if (skb) {
-		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
+		__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
+				    &ireq->ir_v6_rmt_addr);
 
-		fl6->daddr = treq->rmt_addr;
+		fl6->daddr = ireq->ir_v6_rmt_addr;
 		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
 		err = net_xmit_eval(err);
@@ -502,7 +503,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
-	kfree_skb(inet6_rsk(req)->pktopts);
+	kfree_skb(inet_rsk(req)->pktopts);
 }
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -521,7 +522,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
 static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk,
 						      struct request_sock *req)
 {
-	return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr);
+	return tcp_v6_md5_do_lookup(sk, &inet_rsk(req)->ir_v6_rmt_addr);
 }
 
 static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
@@ -623,8 +624,8 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
 		saddr = &inet6_sk(sk)->saddr;
 		daddr = &sk->sk_v6_daddr;
 	} else if (req) {
-		saddr = &inet6_rsk(req)->loc_addr;
-		daddr = &inet6_rsk(req)->rmt_addr;
+		saddr = &inet_rsk(req)->ir_v6_loc_addr;
+		daddr = &inet_rsk(req)->ir_v6_rmt_addr;
 	} else {
 		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 		saddr = &ip6h->saddr;
@@ -949,7 +950,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_options_received tmp_opt;
 	struct request_sock *req;
-	struct inet6_request_sock *treq;
+	struct inet_request_sock *ireq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 isn = TCP_SKB_CB(skb)->when;
@@ -994,25 +995,25 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	treq = inet6_rsk(req);
-	treq->rmt_addr = ipv6_hdr(skb)->saddr;
-	treq->loc_addr = ipv6_hdr(skb)->daddr;
+	ireq = inet_rsk(req);
+	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
 	if (!want_cookie || tmp_opt.tstamp_ok)
 		TCP_ECN_create_request(req, skb, sock_net(sk));
 
-	treq->iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = sk->sk_bound_dev_if;
 
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		treq->iif = inet6_iif(skb);
+	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq->ir_iif = inet6_iif(skb);
 
 	if (!isn) {
 		if (ipv6_opt_accepted(sk, skb) ||
 		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
 			atomic_inc(&skb->users);
-			treq->pktopts = skb;
+			ireq->pktopts = skb;
 		}
 
 		if (want_cookie) {
@@ -1051,7 +1052,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			 * to the moment of synflood.
 			 */
 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
-				       &treq->rmt_addr, ntohs(tcp_hdr(skb)->source));
+				       &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
 			goto drop_and_release;
 		}
 
@@ -1086,7 +1087,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
-	struct inet6_request_sock *treq;
+	struct inet_request_sock *ireq;
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1151,7 +1152,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		return newsk;
 	}
 
-	treq = inet6_rsk(req);
+	ireq = inet_rsk(req);
 
 	if (sk_acceptq_is_full(sk))
 		goto out_overflow;
@@ -1185,10 +1186,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	newsk->sk_v6_daddr = treq->rmt_addr;
-	newnp->saddr = treq->loc_addr;
-	newsk->sk_v6_rcv_saddr = treq->loc_addr;
-	newsk->sk_bound_dev_if = treq->iif;
+	newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+	newnp->saddr = ireq->ir_v6_loc_addr;
+	newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+	newsk->sk_bound_dev_if = ireq->ir_iif;
 
 	/* Now IPv6 options...
 
@@ -1203,11 +1204,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	/* Clone pktoptions received with SYN */
 	newnp->pktoptions = NULL;
-	if (treq->pktopts != NULL) {
-		newnp->pktoptions = skb_clone(treq->pktopts,
+	if (ireq->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(ireq->pktopts,
 					      sk_gfp_atomic(sk, GFP_ATOMIC));
-		consume_skb(treq->pktopts);
-		treq->pktopts = NULL;
+		consume_skb(ireq->pktopts);
+		ireq->pktopts = NULL;
 		if (newnp->pktoptions)
 			skb_set_owner_r(newnp->pktoptions, newsk);
 	}
@@ -1722,8 +1723,8 @@ static void get_openreq6(struct seq_file *seq,
 			 const struct sock *sk, struct request_sock *req, int i, kuid_t uid)
 {
 	int ttd = req->expires - jiffies;
-	const struct in6_addr *src = &inet6_rsk(req)->loc_addr;
-	const struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
+	const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
+	const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr;
 
 	if (ttd < 0)
 		ttd = 0;
@@ -1734,10 +1735,10 @@ static void get_openreq6(struct seq_file *seq,
 		   i,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3],
-		   ntohs(inet_rsk(req)->loc_port),
+		   ntohs(inet_rsk(req)->ir_loc_port),
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3],
-		   ntohs(inet_rsk(req)->rmt_port),
+		   ntohs(inet_rsk(req)->ir_rmt_port),
 		   TCP_SYN_RECV,
 		   0,0, /* could print option size, but that is af dependent. */
 		   1,   /* timers active (only the expire timer) */
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 96a458e12f60..dce1bebf7aec 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -817,7 +817,7 @@ int netlbl_req_setattr(struct request_sock *req,
 	switch (req->rsk_ops->family) {
 	case AF_INET:
 		entry = netlbl_domhsh_getentry_af4(secattr->domain,
-						   inet_rsk(req)->rmt_addr);
+						   inet_rsk(req)->ir_rmt_addr);
 		if (entry == NULL) {
 			ret_val = -ENOENT;
 			goto req_setattr_return;
-- 
cgit v1.2.3


From ba537427d77cf274592f31ce94f4b4cadfad88b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Oct 2013 17:14:52 -0700
Subject: tcp: use ACCESS_ONCE() in tcp_update_pacing_rate()

sk_pacing_rate is read by sch_fq packet scheduler at any time,
with no synchronization, so make sure we update it in a
sensible way. ACCESS_ONCE() is how we instruct compiler
to not do stupid things, like using the memory location
as a temporary variable.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 47b8ab7dce9c..eb651a069a6c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -755,7 +755,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	if (tp->srtt > 8 + 2)
 		do_div(rate, tp->srtt);
 
-	sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+						sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
-- 
cgit v1.2.3


From b44084c2c822f99dd3f2334b288b7e463d222662 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2013 00:04:37 -0700
Subject: inet: rename ir_loc_port to ir_num

In commit 634fb979e8f ("inet: includes a sock_common in request_sock")
I forgot that the two ports in sock_common do not have same byte order :

skc_dport is __be16 (network order), but skc_num is __u16 (host order)

So sparse complains because ir_loc_port (mapped into skc_num) is
considered as __u16 while it should be __be16

Let rename ir_loc_port to ireq->ir_num (analogy with inet->inet_num),
and perform appropriate htons/ntohs conversions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h          | 2 +-
 include/net/tcp.h                | 2 +-
 net/dccp/ipv6.c                  | 4 ++--
 net/dccp/minisocks.c             | 8 ++++----
 net/dccp/output.c                | 2 +-
 net/ipv4/inet_connection_sock.c  | 4 ++--
 net/ipv4/syncookies.c            | 8 ++++----
 net/ipv4/tcp_output.c            | 2 +-
 net/ipv6/inet6_connection_sock.c | 2 +-
 net/ipv6/syncookies.c            | 2 +-
 net/ipv6/tcp_ipv6.c              | 2 +-
 11 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f91204442efa..06da91efbc83 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -72,7 +72,7 @@ struct inet_request_sock {
 	struct request_sock	req;
 #define ir_loc_addr		req.__req_common.skc_rcv_saddr
 #define ir_rmt_addr		req.__req_common.skc_daddr
-#define ir_loc_port		req.__req_common.skc_num
+#define ir_num			req.__req_common.skc_num
 #define ir_rmt_port		req.__req_common.skc_dport
 #define ir_v6_rmt_addr		req.__req_common.skc_v6_daddr
 #define ir_v6_loc_addr		req.__req_common.skc_v6_rcv_saddr
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 24a06161d174..1db3a016bff6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1110,7 +1110,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->acked = 0;
 	ireq->ecn_ok = 0;
 	ireq->ir_rmt_port = tcp_hdr(skb)->source;
-	ireq->ir_loc_port = tcp_hdr(skb)->dest;
+	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
 }
 
 void tcp_enter_memory_pressure(struct sock *sk);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 5cc5b24a956e..4ac71ff7c2e4 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -231,7 +231,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 	fl6.flowlabel = 0;
 	fl6.flowi6_oif = ireq->ir_iif;
 	fl6.fl6_dport = ireq->ir_rmt_port;
-	fl6.fl6_sport = ireq->ir_loc_port;
+	fl6.fl6_sport = htons(ireq->ir_num);
 	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
 
 
@@ -509,7 +509,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		fl6.saddr = ireq->ir_v6_loc_addr;
 		fl6.flowi6_oif = sk->sk_bound_dev_if;
 		fl6.fl6_dport = ireq->ir_rmt_port;
-		fl6.fl6_sport = ireq->ir_loc_port;
+		fl6.fl6_sport = htons(ireq->ir_num);
 		security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
 		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 66afbcec2941..9e2f78bc1553 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -266,10 +266,10 @@ int dccp_reqsk_init(struct request_sock *req,
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
-	inet_rsk(req)->ir_rmt_port	  = dccp_hdr(skb)->dccph_sport;
-	inet_rsk(req)->ir_loc_port	  = dccp_hdr(skb)->dccph_dport;
-	inet_rsk(req)->acked	  = 0;
-	dreq->dreq_timestamp_echo = 0;
+	inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+	inet_rsk(req)->ir_num	   = ntohs(dccp_hdr(skb)->dccph_dport);
+	inet_rsk(req)->acked	   = 0;
+	dreq->dreq_timestamp_echo  = 0;
 
 	/* inherit feature negotiation options from listening socket */
 	return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 9bf195d1b87a..8876078859da 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -424,7 +424,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	/* Build and checksum header */
 	dh = dccp_zeroed_hdr(skb, dccp_header_size);
 
-	dh->dccph_sport	= inet_rsk(req)->ir_loc_port;
+	dh->dccph_sport	= htons(inet_rsk(req)->ir_num);
 	dh->dccph_dport	= inet_rsk(req)->ir_rmt_port;
 	dh->dccph_doff	= (dccp_header_size +
 			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2ffd931d652f..fc0e649cc002 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -676,8 +676,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
-		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->ir_loc_port);
-		inet_sk(newsk)->inet_sport = inet_rsk(req)->ir_loc_port;
+		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 984e21cf3c50..3b64c59b4109 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -304,10 +304,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
 	req->mss		= mss;
-	ireq->ir_loc_port		= th->dest;
-	ireq->ir_rmt_port		= th->source;
-	ireq->ir_loc_addr		= ip_hdr(skb)->daddr;
-	ireq->ir_rmt_addr		= ip_hdr(skb)->saddr;
+	ireq->ir_num		= ntohs(th->dest);
+	ireq->ir_rmt_port	= th->source;
+	ireq->ir_loc_addr	= ip_hdr(skb)->daddr;
+	ireq->ir_rmt_addr	= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= ecn_ok;
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index faec81353522..2822ad021a48 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2734,7 +2734,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->syn = 1;
 	th->ack = 1;
 	TCP_ECN_make_synack(req, th);
-	th->source = ireq->ir_loc_port;
+	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
 	 * not even correctly set)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 1317c569b58f..77bb8afb141d 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -83,7 +83,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
 	fl6->flowi6_oif = ireq->ir_iif;
 	fl6->flowi6_mark = sk->sk_mark;
 	fl6->fl6_dport = ireq->ir_rmt_port;
-	fl6->fl6_sport = ireq->ir_loc_port;
+	fl6->fl6_sport = htons(ireq->ir_num);
 	security_req_classify_flow(req, flowi6_to_flowi(fl6));
 
 	dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bc5698f9e4cd..d04d3f1dd9b7 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -194,7 +194,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	req->mss = mss;
 	ireq->ir_rmt_port = th->source;
-	ireq->ir_loc_port = th->dest;
+	ireq->ir_num = ntohs(th->dest);
 	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
 	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
 	if (ipv6_opt_accepted(sk, skb) ||
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index db234d609b33..b996ee2005a9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1735,7 +1735,7 @@ static void get_openreq6(struct seq_file *seq,
 		   i,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3],
-		   ntohs(inet_rsk(req)->ir_loc_port),
+		   inet_rsk(req)->ir_num,
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3],
 		   ntohs(inet_rsk(req)->ir_rmt_port),
-- 
cgit v1.2.3


From 7263a5187f9e9de45fcb51349cf0e031142c19a1 Mon Sep 17 00:00:00 2001
From: Christophe Gouault <christophe.gouault@6wind.com>
Date: Tue, 8 Oct 2013 17:21:22 +0200
Subject: vti: get rid of nf mark rule in prerouting

This patch fixes and improves the use of vti interfaces (while
lightly changing the way of configuring them).

Currently:

- it is necessary to identify and mark inbound IPsec
  packets destined to each vti interface, via netfilter rules in
  the mangle table at prerouting hook.

- the vti module cannot retrieve the right tunnel in input since
  commit b9959fd3: vti tunnels all have an i_key, but the tunnel lookup
  is done with flag TUNNEL_NO_KEY, so there no chance to retrieve them.

- the i_key is used by the outbound processing as a mark to lookup
  for the right SP and SA bundle.

This patch uses the o_key to store the vti mark (instead of i_key) and
enables:

- to avoid the need for previously marking the inbound skbuffs via a
  netfilter rule.
- to properly retrieve the right tunnel in input, only based on the IPsec
  packet outer addresses.
- to properly perform an inbound policy check (using the tunnel o_key
  as a mark).
- to properly perform an outbound SPD and SAD lookup (using the tunnel
  o_key as a mark).
- to keep the current mark of the skbuff. The skbuff mark is neither
  used nor changed by the vti interface. Only the vti interface o_key
  is used.

SAs have a wildcard mark.
SPs have a mark equal to the vti interface o_key.

The vti interface must be created as follows (i_key = 0, o_key = mark):

   ip link add vti1 mode vti local 1.1.1.1 remote 2.2.2.2 okey 1

The SPs attached to vti1 must be created as follows (mark = vti1 o_key):

   ip xfrm policy add dir out mark 1 tmpl src 1.1.1.1 dst 2.2.2.2 \
      proto esp mode tunnel
   ip xfrm policy add dir in  mark 1 tmpl src 2.2.2.2 dst 1.1.1.1 \
      proto esp mode tunnel

The SAs are created with the default wildcard mark. There is no
distinction between global vs. vti SAs. Just their addresses will
possibly link them to a vti interface:

   ip xfrm state add src 1.1.1.1 dst 2.2.2.2 proto esp spi 1000 mode tunnel \
                 enc "cbc(aes)" "azertyuiopqsdfgh"

   ip xfrm state add src 2.2.2.2 dst 1.1.1.1 proto esp spi 2000 mode tunnel \
                 enc "cbc(aes)" "sqbdhgqsdjqjsdfh"

To avoid matching "global" (not vti) SPs in vti interfaces, global SPs
should no use the default wildcard mark, but explicitly match mark 0.

To avoid a double SPD lookup in input and output (in global and vti SPDs),
the NOPOLICY and NOXFRM options should be set on the vti interfaces:

   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_policy
   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_xfrm

The outgoing traffic is steered to vti1 by a route via the vti interface:

   ip route add 192.168.0.0/16 dev vti1

The incoming IPsec traffic is steered to vti1 because its outer addresses
match the vti1 tunnel configuration.

Signed-off-by: Christophe Gouault <christophe.gouault@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e805e7b3030e..6e87f853d033 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -125,8 +125,17 @@ static int vti_rcv(struct sk_buff *skb)
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
+		u32 oldmark = skb->mark;
+		int ret;
 
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+
+		/* temporarily mark the skb with the tunnel o_key, to
+		 * only match policies with this mark.
+		 */
+		skb->mark = be32_to_cpu(tunnel->parms.o_key);
+		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
+		skb->mark = oldmark;
+		if (!ret)
 			return -1;
 
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -135,7 +144,6 @@ static int vti_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		skb->mark = 0;
 		secpath_reset(skb);
 		skb->dev = tunnel->dev;
 		return 1;
@@ -167,7 +175,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&fl4, 0, sizeof(fl4));
 	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
+			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
 			   RT_SCOPE_UNIVERSE,
 			   IPPROTO_IPIP, 0,
 			   dst, tiph->saddr, 0, 0);
-- 
cgit v1.2.3


From ccdbb6e96beca362db876d820ac1e560ff6d9579 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Oct 2013 08:43:00 -0700
Subject: tcp: tcp_transmit_skb() optimizations

1) We need to take a timestamp only for skb that should be cloned.

Other skbs are not in write queue and no rtt estimation is done on them.

2) the unlikely() hint is wrong for receivers (they send pure ACK)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: MF Nowlan <fitz@cs.yale.edu>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2822ad021a48..e5ce0e1d13b7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -850,15 +850,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 
-	/* If congestion control is doing timestamping, we must
-	 * take such a timestamp before we potentially clone/copy.
-	 */
-	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
-		__net_timestamp(skb);
-
-	if (likely(clone_it)) {
+	if (clone_it) {
 		const struct sk_buff *fclone = skb + 1;
 
+		/* If congestion control is doing timestamping, we must
+		 * take such a timestamp before we potentially clone/copy.
+		 */
+		if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
+			__net_timestamp(skb);
+
 		if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
 			     fclone->fclone == SKB_FCLONE_CLONE))
 			NET_INC_STATS_BH(sock_net(sk),
-- 
cgit v1.2.3


From 795aa6ef6a1aba99050735eadd0c2341b789b53b Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Oct 2013 09:21:55 +0200
Subject: netfilter: pass hook ops to hookfn

Pass the hook ops to the hookfn to allow for generic hook
functions. This change is required by nf_tables.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                      |  3 +-
 net/bridge/br_netfilter.c                      | 22 +++++++++-----
 net/bridge/netfilter/ebtable_filter.c          | 16 ++++++----
 net/bridge/netfilter/ebtable_nat.c             | 16 ++++++----
 net/decnet/netfilter/dn_rtmsg.c                |  2 +-
 net/ipv4/netfilter/arptable_filter.c           |  5 +--
 net/ipv4/netfilter/ipt_CLUSTERIP.c             |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c              |  2 +-
 net/ipv4/netfilter/iptable_filter.c            |  7 +++--
 net/ipv4/netfilter/iptable_mangle.c            | 10 +++---
 net/ipv4/netfilter/iptable_nat.c               | 26 ++++++++--------
 net/ipv4/netfilter/iptable_raw.c               |  6 ++--
 net/ipv4/netfilter/iptable_security.c          |  7 +++--
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 12 ++++----
 net/ipv4/netfilter/nf_defrag_ipv4.c            |  6 ++--
 net/ipv6/netfilter/ip6t_SYNPROXY.c             |  2 +-
 net/ipv6/netfilter/ip6table_filter.c           |  5 +--
 net/ipv6/netfilter/ip6table_mangle.c           | 10 +++---
 net/ipv6/netfilter/ip6table_nat.c              | 27 +++++++++--------
 net/ipv6/netfilter/ip6table_raw.c              |  5 +--
 net/ipv6/netfilter/ip6table_security.c         |  5 +--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 14 +++++----
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |  6 ++--
 net/netfilter/core.c                           |  2 +-
 net/netfilter/ipvs/ip_vs_core.c                | 42 +++++++++++++-------------
 security/selinux/hooks.c                       | 10 +++---
 26 files changed, 148 insertions(+), 122 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 61223c52414f..fef7e67f7101 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -42,7 +42,8 @@ int netfilter_init(void);
 
 struct sk_buff;
 
-typedef unsigned int nf_hookfn(unsigned int hooknum,
+struct nf_hook_ops;
+typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops,
 			       struct sk_buff *skb,
 			       const struct net_device *in,
 			       const struct net_device *out,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index f87736270eaa..878f008afefa 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -619,7 +619,7 @@ bad:
 
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
  * to ip6tables, which doesn't support NAT, so things are fairly simple. */
-static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
+static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
@@ -669,7 +669,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
  * receiving device) to make netfilter happy, the REDIRECT
  * target in particular.  Save the original destination IP
  * address to be able to detect DNAT afterwards. */
-static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
@@ -691,7 +692,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 			return NF_ACCEPT;
 
 		nf_bridge_pull_encap_header_rcsum(skb);
-		return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
+		return br_nf_pre_routing_ipv6(ops, skb, in, out, okfn);
 	}
 
 	if (!brnf_call_iptables && !br->nf_call_iptables)
@@ -727,7 +728,8 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
  * took place when the packet entered the bridge), but we
  * register an IPv4 PRE_ROUTING 'sabotage' hook that will
  * prevent this from happening. */
-static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
 				   const struct net_device *in,
 				   const struct net_device *out,
 				   int (*okfn)(struct sk_buff *))
@@ -765,7 +767,8 @@ static int br_nf_forward_finish(struct sk_buff *skb)
  * but we are still able to filter on the 'real' indev/outdev
  * because of the physdev module. For ARP, indev and outdev are the
  * bridge ports. */
-static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+				     struct sk_buff *skb,
 				     const struct net_device *in,
 				     const struct net_device *out,
 				     int (*okfn)(struct sk_buff *))
@@ -818,7 +821,8 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
-static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
@@ -878,7 +882,8 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 #endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
-static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
 				       int (*okfn)(struct sk_buff *))
@@ -923,7 +928,8 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 /* IP/SABOTAGE *****************************************************/
 /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
  * for the second time. */
-static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb,
+static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
 				   const struct net_device *in,
 				   const struct net_device *out,
 				   int (*okfn)(struct sk_buff *))
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 94b2b700cff8..bb2da7b706e7 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -60,17 +60,21 @@ static const struct ebt_table frame_filter =
 };
 
 static unsigned int
-ebt_in_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	    const struct net_device *in, const struct net_device *out,
+	    int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_filter);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(in)->xt.frame_filter);
 }
 
 static unsigned int
-ebt_out_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_filter);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(out)->xt.frame_filter);
 }
 
 static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 322555acdd40..bd238f1f105b 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -60,17 +60,21 @@ static struct ebt_table frame_nat =
 };
 
 static unsigned int
-ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in
-   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	   const struct net_device *in, const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_nat);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(in)->xt.frame_nat);
 }
 
 static unsigned int
-ebt_nat_out(unsigned int hook, struct sk_buff *skb, const struct net_device *in
-   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	    const struct net_device *in, const struct net_device *out,
+	    int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_nat);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(out)->xt.frame_nat);
 }
 
 static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 2a7efe388344..e83015cecfa7 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -87,7 +87,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 }
 
 
-static unsigned int dnrmg_hook(unsigned int hook,
+static unsigned int dnrmg_hook(const struct nf_hook_ops *ops,
 			struct sk_buff *skb,
 			const struct net_device *in,
 			const struct net_device *out,
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index a865f6f94013..802ddecb30b8 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {
 
 /* The work comes in here from netfilter.c */
 static unsigned int
-arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
+	return arpt_do_table(skb, ops->hooknum, in, out,
+			     net->ipv4.arptable_filter);
 }
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0b732efd32e2..a2e2b61cd7da 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -483,7 +483,7 @@ static void arp_print(struct arp_payload *payload)
 #endif
 
 static unsigned int
-arp_mangle(unsigned int hook,
+arp_mangle(const struct nf_hook_ops *ops,
 	   struct sk_buff *skb,
 	   const struct net_device *in,
 	   const struct net_device *out,
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index b6346bf2fde3..01cffeaa0085 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -297,7 +297,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
 				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 50af5b45c050..e08a74a243a8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,20 +33,21 @@ static const struct xt_table packet_filter = {
 };
 
 static unsigned int
-iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		    const struct net_device *in, const struct net_device *out,
 		    int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT &&
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* root is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
+	return ipt_do_table(skb, ops->hooknum, in, out,
+			    net->ipv4.iptable_filter);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 0d8cd82e0fad..6a5079c34bb3 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-iptable_mangle_hook(unsigned int hook,
+iptable_mangle_hook(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	if (hook == NF_INET_LOCAL_OUT)
+	if (ops->hooknum == NF_INET_LOCAL_OUT)
 		return ipt_mangle_out(skb, out);
-	if (hook == NF_INET_POST_ROUTING)
-		return ipt_do_table(skb, hook, in, out,
+	if (ops->hooknum == NF_INET_POST_ROUTING)
+		return ipt_do_table(skb, ops->hooknum, in, out,
 				    dev_net(out)->ipv4.iptable_mangle);
 	/* PREROUTING/INPUT/FORWARD: */
-	return ipt_do_table(skb, hook, in, out,
+	return ipt_do_table(skb, ops->hooknum, in, out,
 			    dev_net(in)->ipv4.iptable_mangle);
 }
 
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 683bfaffed65..ee2886126e3d 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_fn(unsigned int hooknum,
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -71,7 +71,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
 	/* maniptype == SRC for postrouting. */
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 
 	/* We never see fragments: conntrack defrags on pre-routing
 	 * and local-out, and nf_nat_out protects post-routing.
@@ -108,7 +108,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 	case IP_CT_RELATED_REPLY:
 		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
 			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-							   hooknum))
+							   ops->hooknum))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
@@ -121,14 +121,14 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
 		} else {
 			pr_debug("Already setup manip %s for ct %p\n",
 				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
 				 ct);
-			if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 				goto oif_changed;
 		}
 		break;
@@ -137,11 +137,11 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
 
 oif_changed:
 	nf_ct_kill_acct(ct, ctinfo, skb);
@@ -149,7 +149,7 @@ oif_changed:
 }
 
 static unsigned int
-nf_nat_ipv4_in(unsigned int hooknum,
+nf_nat_ipv4_in(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -158,7 +158,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    daddr != ip_hdr(skb)->daddr)
 		skb_dst_drop(skb);
@@ -167,7 +167,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_out(unsigned int hooknum,
+nf_nat_ipv4_out(const struct nf_hook_ops *ops,
 		struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -185,7 +185,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -207,7 +207,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_local_fn(unsigned int hooknum,
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
@@ -223,7 +223,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1f82aea11df6..b2f7e8f98316 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT && 
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* root is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
+	return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index f867a8d38bf7..c86647ed2078 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,21 +37,22 @@ static const struct xt_table security_table = {
 };
 
 static unsigned int
-iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT &&
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* Somebody is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
+	return ipt_do_table(skb, ops->hooknum, in, out,
+			    net->ipv4.iptable_security);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 86f5b34a4ed1..ecd8bec411c9 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv4_helper(unsigned int hooknum,
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -121,7 +121,7 @@ static unsigned int ipv4_helper(unsigned int hooknum,
 			    ct, ctinfo);
 }
 
-static unsigned int ipv4_confirm(unsigned int hooknum,
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
 				 struct sk_buff *skb,
 				 const struct net_device *in,
 				 const struct net_device *out,
@@ -147,16 +147,16 @@ out:
 	return nf_conntrack_confirm(skb);
 }
 
-static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
 				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
 }
 
-static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
 	if (skb->len < sizeof(struct iphdr) ||
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
-	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
 }
 
 /* Connection tracking may drop packets, but never alters them, so
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 742815518b0f..12e13bd82b5b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -60,7 +60,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 		return IP_DEFRAG_CONNTRACK_OUT + zone;
 }
 
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
 					  struct sk_buff *skb,
 					  const struct net_device *in,
 					  const struct net_device *out,
@@ -83,7 +83,9 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
 #endif
 	/* Gather fragments. */
 	if (ip_is_fragment(ip_hdr(skb))) {
-		enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+		enum ip_defrag_users user =
+			nf_ct_defrag_user(ops->hooknum, skb);
+
 		if (nf_ct_ipv4_gather_frags(skb, user))
 			return NF_STOLEN;
 	}
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 2748b042da72..bf9f612c1bc2 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -312,7 +312,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
+static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
 				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 29b44b14c5ea..ca7f6c128086 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -32,13 +32,14 @@ static const struct xt_table packet_filter = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_filter_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_filter);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_filter);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index c705907ae6ab..307bbb782d14 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -76,17 +76,17 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out)
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_mangle_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	if (hook == NF_INET_LOCAL_OUT)
+	if (ops->hooknum == NF_INET_LOCAL_OUT)
 		return ip6t_mangle_out(skb, out);
-	if (hook == NF_INET_POST_ROUTING)
-		return ip6t_do_table(skb, hook, in, out,
+	if (ops->hooknum == NF_INET_POST_ROUTING)
+		return ip6t_do_table(skb, ops->hooknum, in, out,
 				     dev_net(out)->ipv6.ip6table_mangle);
 	/* INPUT/FORWARD */
-	return ip6t_do_table(skb, hook, in, out,
+	return ip6t_do_table(skb, ops->hooknum, in, out,
 			     dev_net(in)->ipv6.ip6table_mangle);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 9b076d2d3a7b..84c7f33d0cf8 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -63,7 +63,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_fn(unsigned int hooknum,
+nf_nat_ipv6_fn(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -72,7 +72,7 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 	__be16 frag_off;
 	int hdrlen;
 	u8 nexthdr;
@@ -111,7 +111,8 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 
 		if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
 			if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
-							     hooknum, hdrlen))
+							     ops->hooknum,
+							     hdrlen))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
@@ -124,14 +125,14 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
 		} else {
 			pr_debug("Already setup manip %s for ct %p\n",
 				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
 				 ct);
-			if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 				goto oif_changed;
 		}
 		break;
@@ -140,11 +141,11 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
 
 oif_changed:
 	nf_ct_kill_acct(ct, ctinfo, skb);
@@ -152,7 +153,7 @@ oif_changed:
 }
 
 static unsigned int
-nf_nat_ipv6_in(unsigned int hooknum,
+nf_nat_ipv6_in(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -161,7 +162,7 @@ nf_nat_ipv6_in(unsigned int hooknum,
 	unsigned int ret;
 	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
 		skb_dst_drop(skb);
@@ -170,7 +171,7 @@ nf_nat_ipv6_in(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_out(unsigned int hooknum,
+nf_nat_ipv6_out(const struct nf_hook_ops *ops,
 		struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -187,7 +188,7 @@ nf_nat_ipv6_out(unsigned int hooknum,
 	if (skb->len < sizeof(struct ipv6hdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -209,7 +210,7 @@ nf_nat_ipv6_out(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_local_fn(unsigned int hooknum,
+nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
@@ -224,7 +225,7 @@ nf_nat_ipv6_local_fn(unsigned int hooknum,
 	if (skb->len < sizeof(struct ipv6hdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9a626d86720f..5274740acecc 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -19,13 +19,14 @@ static const struct xt_table packet_raw = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_raw_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		  const struct net_device *in, const struct net_device *out,
 		  int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_raw);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_raw);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index ce88d1d7e525..ab3b0219ecfa 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -36,14 +36,15 @@ static const struct xt_table security_table = {
 };
 
 static unsigned int
-ip6table_security_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		       const struct net_device *in,
 		       const struct net_device *out,
 		       int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_security);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_security);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 54b75ead5a69..486545eb42ce 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -95,7 +95,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv6_helper(unsigned int hooknum,
+static unsigned int ipv6_helper(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -133,7 +133,7 @@ static unsigned int ipv6_helper(unsigned int hooknum,
 	return helper->help(skb, protoff, ct, ctinfo);
 }
 
-static unsigned int ipv6_confirm(unsigned int hooknum,
+static unsigned int ipv6_confirm(const struct nf_hook_ops *ops,
 				 struct sk_buff *skb,
 				 const struct net_device *in,
 				 const struct net_device *out,
@@ -219,16 +219,17 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
 	return nf_conntrack_in(net, PF_INET6, hooknum, skb);
 }
 
-static unsigned int ipv6_conntrack_in(unsigned int hooknum,
+static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops,
 				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, in, out, okfn);
+	return __ipv6_conntrack_in(dev_net(in), ops->hooknum, skb, in, out,
+				   okfn);
 }
 
-static unsigned int ipv6_conntrack_local(unsigned int hooknum,
+static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -239,7 +240,8 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
 		return NF_ACCEPT;
 	}
-	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, in, out, okfn);
+	return __ipv6_conntrack_in(dev_net(out), ops->hooknum, skb, in, out,
+				   okfn);
 }
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index aacd121fe8c5..ec483aa3f60f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -52,7 +52,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 
 }
 
-static unsigned int ipv6_defrag(unsigned int hooknum,
+static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -66,7 +66,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
 		return NF_ACCEPT;
 #endif
 
-	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb));
 	/* queued */
 	if (reasm == NULL)
 		return NF_STOLEN;
@@ -75,7 +75,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
 	if (reasm == skb)
 		return NF_ACCEPT;
 
-	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+	nf_ct_frag6_output(ops->hooknum, reasm, (struct net_device *)in,
 			   (struct net_device *)out, okfn);
 
 	return NF_STOLEN;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 593b16ea45e0..1fbab0cdd302 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -146,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,
 		/* Optimization: we don't need to hold module
 		   reference here, since function can't sleep. --RR */
 repeat:
-		verdict = (*elemp)->hook(hook, skb, indev, outdev, okfn);
+		verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);
 		if (verdict != NF_ACCEPT) {
 #ifdef CONFIG_NETFILTER_DEBUG
 			if (unlikely((verdict & NF_VERDICT_MASK)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 74fd00c27210..34fda62f40f6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1239,11 +1239,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	     const struct net_device *in, const struct net_device *out,
 	     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET);
+	return ip_vs_out(ops->hooknum, skb, AF_INET);
 }
 
 /*
@@ -1251,11 +1251,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET);
+	return ip_vs_out(ops->hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1266,11 +1266,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	     const struct net_device *in, const struct net_device *out,
 	     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET6);
+	return ip_vs_out(ops->hooknum, skb, AF_INET6);
 }
 
 /*
@@ -1278,11 +1278,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET6);
+	return ip_vs_out(ops->hooknum, skb, AF_INET6);
 }
 
 #endif
@@ -1733,12 +1733,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
  *	Schedule and forward packets from remote clients
  */
 static unsigned int
-ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET);
+	return ip_vs_in(ops->hooknum, skb, AF_INET);
 }
 
 /*
@@ -1746,11 +1746,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from local clients
  */
 static unsigned int
-ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET);
+	return ip_vs_in(ops->hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1760,7 +1760,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
  * Copy info from first fragment, to the rest of them.
  */
 static unsigned int
-ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_preroute_frag6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
@@ -1792,12 +1792,12 @@ ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from remote clients
  */
 static unsigned int
-ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET6);
+	return ip_vs_in(ops->hooknum, skb, AF_INET6);
 }
 
 /*
@@ -1805,11 +1805,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from local clients
  */
 static unsigned int
-ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET6);
+	return ip_vs_in(ops->hooknum, skb, AF_INET6);
 }
 
 #endif
@@ -1825,7 +1825,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
  *      and send them to ip_vs_in_icmp.
  */
 static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
@@ -1842,12 +1842,12 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
-	return ip_vs_in_icmp(skb, &r, hooknum);
+	return ip_vs_in_icmp(skb, &r, ops->hooknum);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
 static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in, const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
@@ -1866,7 +1866,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
-	return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
+	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
 }
 #endif
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 568c7699abf1..3f224d7795f5 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4668,7 +4668,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_forward(unsigned int hooknum,
+static unsigned int selinux_ipv4_forward(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -4678,7 +4678,7 @@ static unsigned int selinux_ipv4_forward(unsigned int hooknum,
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static unsigned int selinux_ipv6_forward(unsigned int hooknum,
+static unsigned int selinux_ipv6_forward(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -4710,7 +4710,7 @@ static unsigned int selinux_ip_output(struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_output(unsigned int hooknum,
+static unsigned int selinux_ipv4_output(const struct nf_hook_ops *ops,
 					struct sk_buff *skb,
 					const struct net_device *in,
 					const struct net_device *out,
@@ -4837,7 +4837,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
+static unsigned int selinux_ipv4_postroute(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
@@ -4847,7 +4847,7 @@ static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
+static unsigned int selinux_ipv6_postroute(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
-- 
cgit v1.2.3


From 96518518cc417bb0a8c80b9fb736202e28acdf96 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 14 Oct 2013 11:00:02 +0200
Subject: netfilter: add nftables

This patch adds nftables which is the intended successor of iptables.
This packet filtering framework reuses the existing netfilter hooks,
the connection tracking system, the NAT subsystem, the transparent
proxying engine, the logging infrastructure and the userspace packet
queueing facilities.

In a nutshell, nftables provides a pseudo-state machine with 4 general
purpose registers of 128 bits and 1 specific purpose register to store
verdicts. This pseudo-machine comes with an extensible instruction set,
a.k.a. "expressions" in the nftables jargon. The expressions included
in this patch provide the basic functionality, they are:

* bitwise: to perform bitwise operations.
* byteorder: to change from host/network endianess.
* cmp: to compare data with the content of the registers.
* counter: to enable counters on rules.
* ct: to store conntrack keys into register.
* exthdr: to match IPv6 extension headers.
* immediate: to load data into registers.
* limit: to limit matching based on packet rate.
* log: to log packets.
* meta: to match metainformation that usually comes with the skbuff.
* nat: to perform Network Address Translation.
* payload: to fetch data from the packet payload and store it into
  registers.
* reject (IPv4 only): to explicitly close connection, eg. TCP RST.

Using this instruction-set, the userspace utility 'nft' can transform
the rules expressed in human-readable text representation (using a
new syntax, inspired by tcpdump) to nftables bytecode.

nftables also inherits the table, chain and rule objects from
iptables, but in a more configurable way, and it also includes the
original datatype-agnostic set infrastructure with mapping support.
This set infrastructure is enhanced in the follow up patch (netfilter:
nf_tables: add netlink set API).

This patch includes the following components:

* the netlink API: net/netfilter/nf_tables_api.c and
  include/uapi/netfilter/nf_tables.h
* the packet filter core: net/netfilter/nf_tables_core.c
* the expressions (described above): net/netfilter/nft_*.c
* the filter tables: arp, IPv4, IPv6 and bridge:
  net/ipv4/netfilter/nf_tables_ipv4.c
  net/ipv6/netfilter/nf_tables_ipv6.c
  net/ipv4/netfilter/nf_tables_arp.c
  net/bridge/netfilter/nf_tables_bridge.c
* the NAT table (IPv4 only):
  net/ipv4/netfilter/nf_table_nat_ipv4.c
* the route table (similar to mangle):
  net/ipv4/netfilter/nf_table_route_ipv4.c
  net/ipv6/netfilter/nf_table_route_ipv6.c
* internal definitions under:
  include/net/netfilter/nf_tables.h
  include/net/netfilter/nf_tables_core.h
* It also includes an skeleton expression:
  net/netfilter/nft_expr_template.c
  and the preliminary implementation of the meta target
  net/netfilter/nft_meta_target.c

It also includes a change in struct nf_hook_ops to add a new
pointer to store private data to the hook, that is used to store
the rule list per chain.

This patch is based on the patch from Patrick McHardy, plus merged
accumulated cleanups, fixes and small enhancements to the nftables
code that has been done since 2009, which are:

From Patrick McHardy:
* nf_tables: adjust netlink handler function signatures
* nf_tables: only retry table lookup after successful table module load
* nf_tables: fix event notification echo and avoid unnecessary messages
* nft_ct: add l3proto support
* nf_tables: pass expression context to nft_validate_data_load()
* nf_tables: remove redundant definition
* nft_ct: fix maxattr initialization
* nf_tables: fix invalid event type in nf_tables_getrule()
* nf_tables: simplify nft_data_init() usage
* nf_tables: build in more core modules
* nf_tables: fix double lookup expression unregistation
* nf_tables: move expression initialization to nf_tables_core.c
* nf_tables: build in payload module
* nf_tables: use NFPROTO constants
* nf_tables: rename pid variables to portid
* nf_tables: save 48 bits per rule
* nf_tables: introduce chain rename
* nf_tables: check for duplicate names on chain rename
* nf_tables: remove ability to specify handles for new rules
* nf_tables: return error for rule change request
* nf_tables: return error for NLM_F_REPLACE without rule handle
* nf_tables: include NLM_F_APPEND/NLM_F_REPLACE flags in rule notification
* nf_tables: fix NLM_F_MULTI usage in netlink notifications
* nf_tables: include NLM_F_APPEND in rule dumps

From Pablo Neira Ayuso:
* nf_tables: fix stack overflow in nf_tables_newrule
* nf_tables: nft_ct: fix compilation warning
* nf_tables: nft_ct: fix crash with invalid packets
* nft_log: group and qthreshold are 2^16
* nf_tables: nft_meta: fix socket uid,gid handling
* nft_counter: allow to restore counters
* nf_tables: fix module autoload
* nf_tables: allow to remove all rules placed in one chain
* nf_tables: use 64-bits rule handle instead of 16-bits
* nf_tables: fix chain after rule deletion
* nf_tables: improve deletion performance
* nf_tables: add missing code in route chain type
* nf_tables: rise maximum number of expressions from 12 to 128
* nf_tables: don't delete table if in use
* nf_tables: fix basechain release

From Tomasz Bursztyka:
* nf_tables: Add support for changing users chain's name
* nf_tables: Change chain's name to be fixed sized
* nf_tables: Add support for replacing a rule by another one
* nf_tables: Update uapi nftables netlink header documentation

From Florian Westphal:
* nft_log: group is u16, snaplen u32

From Phil Oester:
* nf_tables: operational limit match

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                          |   11 +-
 include/net/netfilter/nf_tables.h                  |  301 ++++
 include/net/netfilter/nf_tables_core.h             |   25 +
 include/uapi/linux/netfilter/Kbuild                |    1 +
 include/uapi/linux/netfilter/nf_conntrack_common.h |    4 +
 include/uapi/linux/netfilter/nf_tables.h           |  582 +++++++
 include/uapi/linux/netfilter/nfnetlink.h           |    5 +-
 net/bridge/netfilter/Kconfig                       |    3 +
 net/bridge/netfilter/Makefile                      |    2 +
 net/bridge/netfilter/nf_tables_bridge.c            |   37 +
 net/ipv4/netfilter/Kconfig                         |   16 +
 net/ipv4/netfilter/Makefile                        |    5 +
 net/ipv4/netfilter/nf_table_nat_ipv4.c             |  409 +++++
 net/ipv4/netfilter/nf_table_route_ipv4.c           |   97 ++
 net/ipv4/netfilter/nf_tables_ipv4.c                |   59 +
 net/ipv4/netfilter/nft_reject_ipv4.c               |  117 ++
 net/ipv6/netfilter/Kconfig                         |    8 +
 net/ipv6/netfilter/Makefile                        |    4 +
 net/ipv6/netfilter/nf_table_route_ipv6.c           |   93 ++
 net/ipv6/netfilter/nf_tables_ipv6.c                |   57 +
 net/netfilter/Kconfig                              |   37 +
 net/netfilter/Makefile                             |   16 +
 net/netfilter/nf_tables_api.c                      | 1760 ++++++++++++++++++++
 net/netfilter/nf_tables_core.c                     |  152 ++
 net/netfilter/nft_bitwise.c                        |  140 ++
 net/netfilter/nft_byteorder.c                      |  167 ++
 net/netfilter/nft_cmp.c                            |  146 ++
 net/netfilter/nft_counter.c                        |  107 ++
 net/netfilter/nft_ct.c                             |  252 +++
 net/netfilter/nft_expr_template.c                  |   88 +
 net/netfilter/nft_exthdr.c                         |  127 ++
 net/netfilter/nft_hash.c                           |  348 ++++
 net/netfilter/nft_immediate.c                      |  113 ++
 net/netfilter/nft_limit.c                          |  113 ++
 net/netfilter/nft_log.c                            |  140 ++
 net/netfilter/nft_meta.c                           |  222 +++
 net/netfilter/nft_meta_target.c                    |  117 ++
 net/netfilter/nft_payload.c                        |  137 ++
 net/netfilter/nft_set.c                            |  381 +++++
 39 files changed, 6393 insertions(+), 6 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables.h
 create mode 100644 include/net/netfilter/nf_tables_core.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables.h
 create mode 100644 net/bridge/netfilter/nf_tables_bridge.c
 create mode 100644 net/ipv4/netfilter/nf_table_nat_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_table_route_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_tables_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c
 create mode 100644 net/ipv6/netfilter/nf_table_route_ipv6.c
 create mode 100644 net/ipv6/netfilter/nf_tables_ipv6.c
 create mode 100644 net/netfilter/nf_tables_api.c
 create mode 100644 net/netfilter/nf_tables_core.c
 create mode 100644 net/netfilter/nft_bitwise.c
 create mode 100644 net/netfilter/nft_byteorder.c
 create mode 100644 net/netfilter/nft_cmp.c
 create mode 100644 net/netfilter/nft_counter.c
 create mode 100644 net/netfilter/nft_ct.c
 create mode 100644 net/netfilter/nft_expr_template.c
 create mode 100644 net/netfilter/nft_exthdr.c
 create mode 100644 net/netfilter/nft_hash.c
 create mode 100644 net/netfilter/nft_immediate.c
 create mode 100644 net/netfilter/nft_limit.c
 create mode 100644 net/netfilter/nft_log.c
 create mode 100644 net/netfilter/nft_meta.c
 create mode 100644 net/netfilter/nft_meta_target.c
 create mode 100644 net/netfilter/nft_payload.c
 create mode 100644 net/netfilter/nft_set.c

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index fef7e67f7101..2077489f9887 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -53,12 +53,13 @@ struct nf_hook_ops {
 	struct list_head list;
 
 	/* User fills in from here down. */
-	nf_hookfn *hook;
-	struct module *owner;
-	u_int8_t pf;
-	unsigned int hooknum;
+	nf_hookfn	*hook;
+	struct module	*owner;
+	void		*priv;
+	u_int8_t	pf;
+	unsigned int	hooknum;
 	/* Hooks are ordered in ascending priority. */
-	int priority;
+	int		priority;
 };
 
 struct nf_sockopt_ops {
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
new file mode 100644
index 000000000000..d26dfa345f49
--- /dev/null
+++ b/include/net/netfilter/nf_tables.h
@@ -0,0 +1,301 @@
+#ifndef _NET_NF_TABLES_H
+#define _NET_NF_TABLES_H
+
+#include <linux/list.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netlink.h>
+
+struct nft_pktinfo {
+	struct sk_buff			*skb;
+	const struct net_device		*in;
+	const struct net_device		*out;
+	u8				hooknum;
+	u8				nhoff;
+	u8				thoff;
+};
+
+struct nft_data {
+	union {
+		u32				data[4];
+		struct {
+			u32			verdict;
+			struct nft_chain	*chain;
+		};
+	};
+} __attribute__((aligned(__alignof__(u64))));
+
+static inline int nft_data_cmp(const struct nft_data *d1,
+			       const struct nft_data *d2,
+			       unsigned int len)
+{
+	return memcmp(d1->data, d2->data, len);
+}
+
+static inline void nft_data_copy(struct nft_data *dst,
+				 const struct nft_data *src)
+{
+	BUILD_BUG_ON(__alignof__(*dst) != __alignof__(u64));
+	*(u64 *)&dst->data[0] = *(u64 *)&src->data[0];
+	*(u64 *)&dst->data[2] = *(u64 *)&src->data[2];
+}
+
+static inline void nft_data_debug(const struct nft_data *data)
+{
+	pr_debug("data[0]=%x data[1]=%x data[2]=%x data[3]=%x\n",
+		 data->data[0], data->data[1],
+		 data->data[2], data->data[3]);
+}
+
+/**
+ *	struct nft_ctx - nf_tables rule context
+ *
+ * 	@afi: address family info
+ * 	@table: the table the chain is contained in
+ * 	@chain: the chain the rule is contained in
+ */
+struct nft_ctx {
+	const struct nft_af_info	*afi;
+	const struct nft_table		*table;
+	const struct nft_chain		*chain;
+};
+
+enum nft_data_types {
+	NFT_DATA_VALUE,
+	NFT_DATA_VERDICT,
+};
+
+struct nft_data_desc {
+	enum nft_data_types		type;
+	unsigned int			len;
+};
+
+extern int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+			 struct nft_data_desc *desc, const struct nlattr *nla);
+extern void nft_data_uninit(const struct nft_data *data,
+			    enum nft_data_types type);
+extern int nft_data_dump(struct sk_buff *skb, int attr,
+			 const struct nft_data *data,
+			 enum nft_data_types type, unsigned int len);
+
+static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
+{
+	return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
+}
+
+extern int nft_validate_input_register(enum nft_registers reg);
+extern int nft_validate_output_register(enum nft_registers reg);
+extern int nft_validate_data_load(const struct nft_ctx *ctx,
+				  enum nft_registers reg,
+				  const struct nft_data *data,
+				  enum nft_data_types type);
+
+/**
+ *	struct nft_expr_ops - nf_tables expression operations
+ *
+ *	@eval: Expression evaluation function
+ *	@init: initialization function
+ *	@destroy: destruction function
+ *	@dump: function to dump parameters
+ *	@list: used internally
+ *	@name: Identifier
+ *	@owner: module reference
+ *	@policy: netlink attribute policy
+ *	@maxattr: highest netlink attribute number
+ *	@size: full expression size, including private data size
+ */
+struct nft_expr;
+struct nft_expr_ops {
+	void				(*eval)(const struct nft_expr *expr,
+						struct nft_data data[NFT_REG_MAX + 1],
+						const struct nft_pktinfo *pkt);
+	int				(*init)(const struct nft_ctx *ctx,
+						const struct nft_expr *expr,
+						const struct nlattr * const tb[]);
+	void				(*destroy)(const struct nft_expr *expr);
+	int				(*dump)(struct sk_buff *skb,
+						const struct nft_expr *expr);
+
+	struct list_head		list;
+	const char			*name;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	unsigned int			maxattr;
+	unsigned int			size;
+};
+
+#define NFT_EXPR_SIZE(size)		(sizeof(struct nft_expr) + \
+					 ALIGN(size, __alignof__(struct nft_expr)))
+
+/**
+ *	struct nft_expr - nf_tables expression
+ *
+ *	@ops: expression ops
+ *	@data: expression private data
+ */
+struct nft_expr {
+	const struct nft_expr_ops	*ops;
+	unsigned char			data[];
+};
+
+static inline void *nft_expr_priv(const struct nft_expr *expr)
+{
+	return (void *)expr->data;
+}
+
+/**
+ *	struct nft_rule - nf_tables rule
+ *
+ *	@list: used internally
+ *	@rcu_head: used internally for rcu
+ *	@handle: rule handle
+ *	@dlen: length of expression data
+ *	@data: expression data
+ */
+struct nft_rule {
+	struct list_head		list;
+	struct rcu_head			rcu_head;
+	u64				handle:48,
+					dlen:16;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(struct nft_expr))));
+};
+
+static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
+{
+	return (struct nft_expr *)&rule->data[0];
+}
+
+static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr)
+{
+	return ((void *)expr) + expr->ops->size;
+}
+
+static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
+{
+	return (struct nft_expr *)&rule->data[rule->dlen];
+}
+
+/*
+ * The last pointer isn't really necessary, but the compiler isn't able to
+ * determine that the result of nft_expr_last() is always the same since it
+ * can't assume that the dlen value wasn't changed within calls in the loop.
+ */
+#define nft_rule_for_each_expr(expr, last, rule) \
+	for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \
+	     (expr) != (last); \
+	     (expr) = nft_expr_next(expr))
+
+enum nft_chain_flags {
+	NFT_BASE_CHAIN			= 0x1,
+	NFT_CHAIN_BUILTIN		= 0x2,
+};
+
+/**
+ *	struct nft_chain - nf_tables chain
+ *
+ *	@rules: list of rules in the chain
+ *	@list: used internally
+ *	@rcu_head: used internally
+ *	@handle: chain handle
+ *	@flags: bitmask of enum nft_chain_flags
+ *	@use: number of jump references to this chain
+ *	@level: length of longest path to this chain
+ *	@name: name of the chain
+ */
+struct nft_chain {
+	struct list_head		rules;
+	struct list_head		list;
+	struct rcu_head			rcu_head;
+	u64				handle;
+	u8				flags;
+	u16				use;
+	u16				level;
+	char				name[NFT_CHAIN_MAXNAMELEN];
+};
+
+/**
+ *	struct nft_base_chain - nf_tables base chain
+ *
+ *	@ops: netfilter hook ops
+ *	@chain: the chain
+ */
+struct nft_base_chain {
+	struct nf_hook_ops		ops;
+	struct nft_chain		chain;
+};
+
+static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
+{
+	return container_of(chain, struct nft_base_chain, chain);
+}
+
+extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *));
+
+enum nft_table_flags {
+	NFT_TABLE_BUILTIN		= 0x1,
+};
+
+/**
+ *	struct nft_table - nf_tables table
+ *
+ *	@list: used internally
+ *	@chains: chains in the table
+ *	@sets: sets in the table
+ *	@hgenerator: handle generator state
+ *	@use: number of chain references to this table
+ *	@flags: table flag (see enum nft_table_flags)
+ *	@name: name of the table
+ */
+struct nft_table {
+	struct list_head		list;
+	struct list_head		chains;
+	struct list_head		sets;
+	u64				hgenerator;
+	u32				use;
+	u16				flags;
+	char				name[];
+};
+
+/**
+ *	struct nft_af_info - nf_tables address family info
+ *
+ *	@list: used internally
+ *	@family: address family
+ *	@nhooks: number of hooks in this family
+ *	@owner: module owner
+ *	@tables: used internally
+ *	@hooks: hookfn overrides for packet validation
+ */
+struct nft_af_info {
+	struct list_head		list;
+	int				family;
+	unsigned int			nhooks;
+	struct module			*owner;
+	struct list_head		tables;
+	nf_hookfn			*hooks[NF_MAX_HOOKS];
+};
+
+extern int nft_register_afinfo(struct nft_af_info *);
+extern void nft_unregister_afinfo(struct nft_af_info *);
+
+extern int nft_register_table(struct nft_table *, int family);
+extern void nft_unregister_table(struct nft_table *, int family);
+
+extern int nft_register_expr(struct nft_expr_ops *);
+extern void nft_unregister_expr(struct nft_expr_ops *);
+
+#define MODULE_ALIAS_NFT_FAMILY(family)	\
+	MODULE_ALIAS("nft-afinfo-" __stringify(family))
+
+#define MODULE_ALIAS_NFT_TABLE(family, name) \
+	MODULE_ALIAS("nft-table-" __stringify(family) "-" name)
+
+#define MODULE_ALIAS_NFT_EXPR(name) \
+	MODULE_ALIAS("nft-expr-" name)
+
+#endif /* _NET_NF_TABLES_H */
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
new file mode 100644
index 000000000000..283396c916e0
--- /dev/null
+++ b/include/net/netfilter/nf_tables_core.h
@@ -0,0 +1,25 @@
+#ifndef _NET_NF_TABLES_CORE_H
+#define _NET_NF_TABLES_CORE_H
+
+extern int nf_tables_core_module_init(void);
+extern void nf_tables_core_module_exit(void);
+
+extern int nft_immediate_module_init(void);
+extern void nft_immediate_module_exit(void);
+
+extern int nft_cmp_module_init(void);
+extern void nft_cmp_module_exit(void);
+
+extern int nft_lookup_module_init(void);
+extern void nft_lookup_module_exit(void);
+
+extern int nft_bitwise_module_init(void);
+extern void nft_bitwise_module_exit(void);
+
+extern int nft_byteorder_module_init(void);
+extern void nft_byteorder_module_exit(void);
+
+extern int nft_payload_module_init(void);
+extern void nft_payload_module_exit(void);
+
+#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 174915420d3f..6ce0b7f566a7 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -5,6 +5,7 @@ header-y += nf_conntrack_ftp.h
 header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
+header-y += nf_tables.h
 header-y += nf_nat.h
 header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 8dd803818ebe..319f47128db8 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -25,6 +25,10 @@ enum ip_conntrack_info {
 	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
 };
 
+#define NF_CT_STATE_INVALID_BIT			(1 << 0)
+#define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_NUMBER + 1))
+
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {
 	/* It's an expected connection: bit 0 set.  This bit never changed */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
new file mode 100644
index 000000000000..ec6d84a8ed1e
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -0,0 +1,582 @@
+#ifndef _LINUX_NF_TABLES_H
+#define _LINUX_NF_TABLES_H
+
+#define NFT_CHAIN_MAXNAMELEN 32
+
+enum nft_registers {
+	NFT_REG_VERDICT,
+	NFT_REG_1,
+	NFT_REG_2,
+	NFT_REG_3,
+	NFT_REG_4,
+	__NFT_REG_MAX
+};
+#define NFT_REG_MAX	(__NFT_REG_MAX - 1)
+
+/**
+ * enum nft_verdicts - nf_tables internal verdicts
+ *
+ * @NFT_CONTINUE: continue evaluation of the current rule
+ * @NFT_BREAK: terminate evaluation of the current rule
+ * @NFT_JUMP: push the current chain on the jump stack and jump to a chain
+ * @NFT_GOTO: jump to a chain without pushing the current chain on the jump stack
+ * @NFT_RETURN: return to the topmost chain on the jump stack
+ *
+ * The nf_tables verdicts share their numeric space with the netfilter verdicts.
+ */
+enum nft_verdicts {
+	NFT_CONTINUE	= -1,
+	NFT_BREAK	= -2,
+	NFT_JUMP	= -3,
+	NFT_GOTO	= -4,
+	NFT_RETURN	= -5,
+};
+
+/**
+ * enum nf_tables_msg_types - nf_tables netlink message types
+ *
+ * @NFT_MSG_NEWTABLE: create a new table (enum nft_table_attributes)
+ * @NFT_MSG_GETTABLE: get a table (enum nft_table_attributes)
+ * @NFT_MSG_DELTABLE: delete a table (enum nft_table_attributes)
+ * @NFT_MSG_NEWCHAIN: create a new chain (enum nft_chain_attributes)
+ * @NFT_MSG_GETCHAIN: get a chain (enum nft_chain_attributes)
+ * @NFT_MSG_DELCHAIN: delete a chain (enum nft_chain_attributes)
+ * @NFT_MSG_NEWRULE: create a new rule (enum nft_rule_attributes)
+ * @NFT_MSG_GETRULE: get a rule (enum nft_rule_attributes)
+ * @NFT_MSG_DELRULE: delete a rule (enum nft_rule_attributes)
+ */
+enum nf_tables_msg_types {
+	NFT_MSG_NEWTABLE,
+	NFT_MSG_GETTABLE,
+	NFT_MSG_DELTABLE,
+	NFT_MSG_NEWCHAIN,
+	NFT_MSG_GETCHAIN,
+	NFT_MSG_DELCHAIN,
+	NFT_MSG_NEWRULE,
+	NFT_MSG_GETRULE,
+	NFT_MSG_DELRULE,
+	NFT_MSG_MAX,
+};
+
+enum nft_list_attributes {
+	NFTA_LIST_UNPEC,
+	NFTA_LIST_ELEM,
+	__NFTA_LIST_MAX
+};
+#define NFTA_LIST_MAX		(__NFTA_LIST_MAX - 1)
+
+/**
+ * enum nft_hook_attributes - nf_tables netfilter hook netlink attributes
+ *
+ * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32)
+ * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
+ */
+enum nft_hook_attributes {
+	NFTA_HOOK_UNSPEC,
+	NFTA_HOOK_HOOKNUM,
+	NFTA_HOOK_PRIORITY,
+	__NFTA_HOOK_MAX
+};
+#define NFTA_HOOK_MAX		(__NFTA_HOOK_MAX - 1)
+
+/**
+ * enum nft_table_attributes - nf_tables table netlink attributes
+ *
+ * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
+ */
+enum nft_table_attributes {
+	NFTA_TABLE_UNSPEC,
+	NFTA_TABLE_NAME,
+	__NFTA_TABLE_MAX
+};
+#define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
+
+/**
+ * enum nft_chain_attributes - nf_tables chain netlink attributes
+ *
+ * @NFTA_CHAIN_TABLE: name of the table containing the chain (NLA_STRING)
+ * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
+ * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
+ * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ */
+enum nft_chain_attributes {
+	NFTA_CHAIN_UNSPEC,
+	NFTA_CHAIN_TABLE,
+	NFTA_CHAIN_HANDLE,
+	NFTA_CHAIN_NAME,
+	NFTA_CHAIN_HOOK,
+	__NFTA_CHAIN_MAX
+};
+#define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
+
+/**
+ * enum nft_rule_attributes - nf_tables rule netlink attributes
+ *
+ * @NFTA_RULE_TABLE: name of the table containing the rule (NLA_STRING)
+ * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING)
+ * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
+ * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
+ */
+enum nft_rule_attributes {
+	NFTA_RULE_UNSPEC,
+	NFTA_RULE_TABLE,
+	NFTA_RULE_CHAIN,
+	NFTA_RULE_HANDLE,
+	NFTA_RULE_EXPRESSIONS,
+	__NFTA_RULE_MAX
+};
+#define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
+
+enum nft_data_attributes {
+	NFTA_DATA_UNSPEC,
+	NFTA_DATA_VALUE,
+	NFTA_DATA_VERDICT,
+	__NFTA_DATA_MAX
+};
+#define NFTA_DATA_MAX		(__NFTA_DATA_MAX - 1)
+
+/**
+ * enum nft_verdict_attributes - nf_tables verdict netlink attributes
+ *
+ * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts)
+ * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING)
+ */
+enum nft_verdict_attributes {
+	NFTA_VERDICT_UNSPEC,
+	NFTA_VERDICT_CODE,
+	NFTA_VERDICT_CHAIN,
+	__NFTA_VERDICT_MAX
+};
+#define NFTA_VERDICT_MAX	(__NFTA_VERDICT_MAX - 1)
+
+/**
+ * enum nft_expr_attributes - nf_tables expression netlink attributes
+ *
+ * @NFTA_EXPR_NAME: name of the expression type (NLA_STRING)
+ * @NFTA_EXPR_DATA: type specific data (NLA_NESTED)
+ */
+enum nft_expr_attributes {
+	NFTA_EXPR_UNSPEC,
+	NFTA_EXPR_NAME,
+	NFTA_EXPR_DATA,
+	__NFTA_EXPR_MAX
+};
+#define NFTA_EXPR_MAX		(__NFTA_EXPR_MAX - 1)
+
+/**
+ * enum nft_immediate_attributes - nf_tables immediate expression netlink attributes
+ *
+ * @NFTA_IMMEDIATE_DREG: destination register to load data into (NLA_U32)
+ * @NFTA_IMMEDIATE_DATA: data to load (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_immediate_attributes {
+	NFTA_IMMEDIATE_UNSPEC,
+	NFTA_IMMEDIATE_DREG,
+	NFTA_IMMEDIATE_DATA,
+	__NFTA_IMMEDIATE_MAX
+};
+#define NFTA_IMMEDIATE_MAX	(__NFTA_IMMEDIATE_MAX - 1)
+
+/**
+ * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes
+ *
+ * @NFTA_BITWISE_SREG: source register (NLA_U32: nft_registers)
+ * @NFTA_BITWISE_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_BITWISE_LEN: length of operands (NLA_U32)
+ * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes)
+ * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes)
+ *
+ * The bitwise expression performs the following operation:
+ *
+ * dreg = (sreg & mask) ^ xor
+ *
+ * which allow to express all bitwise operations:
+ *
+ * 		mask	xor
+ * NOT:		1	1
+ * OR:		0	x
+ * XOR:		1	x
+ * AND:		x	0
+ */
+enum nft_bitwise_attributes {
+	NFTA_BITWISE_UNSPEC,
+	NFTA_BITWISE_SREG,
+	NFTA_BITWISE_DREG,
+	NFTA_BITWISE_LEN,
+	NFTA_BITWISE_MASK,
+	NFTA_BITWISE_XOR,
+	__NFTA_BITWISE_MAX
+};
+#define NFTA_BITWISE_MAX	(__NFTA_BITWISE_MAX - 1)
+
+/**
+ * enum nft_byteorder_ops - nf_tables byteorder operators
+ *
+ * @NFT_BYTEORDER_NTOH: network to host operator
+ * @NFT_BYTEORDER_HTON: host to network opertaor
+ */
+enum nft_byteorder_ops {
+	NFT_BYTEORDER_NTOH,
+	NFT_BYTEORDER_HTON,
+};
+
+/**
+ * enum nft_byteorder_attributes - nf_tables byteorder expression netlink attributes
+ *
+ * @NFTA_BYTEORDER_SREG: source register (NLA_U32: nft_registers)
+ * @NFTA_BYTEORDER_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_BYTEORDER_OP: operator (NLA_U32: enum nft_byteorder_ops)
+ * @NFTA_BYTEORDER_LEN: length of the data (NLA_U32)
+ * @NFTA_BYTEORDER_SIZE: data size in bytes (NLA_U32: 2 or 4)
+ */
+enum nft_byteorder_attributes {
+	NFTA_BYTEORDER_UNSPEC,
+	NFTA_BYTEORDER_SREG,
+	NFTA_BYTEORDER_DREG,
+	NFTA_BYTEORDER_OP,
+	NFTA_BYTEORDER_LEN,
+	NFTA_BYTEORDER_SIZE,
+	__NFTA_BYTEORDER_MAX
+};
+#define NFTA_BYTEORDER_MAX	(__NFTA_BYTEORDER_MAX - 1)
+
+/**
+ * enum nft_cmp_ops - nf_tables relational operator
+ *
+ * @NFT_CMP_EQ: equal
+ * @NFT_CMP_NEQ: not equal
+ * @NFT_CMP_LT: less than
+ * @NFT_CMP_LTE: less than or equal to
+ * @NFT_CMP_GT: greater than
+ * @NFT_CMP_GTE: greater than or equal to
+ */
+enum nft_cmp_ops {
+	NFT_CMP_EQ,
+	NFT_CMP_NEQ,
+	NFT_CMP_LT,
+	NFT_CMP_LTE,
+	NFT_CMP_GT,
+	NFT_CMP_GTE,
+};
+
+/**
+ * enum nft_cmp_attributes - nf_tables cmp expression netlink attributes
+ *
+ * @NFTA_CMP_SREG: source register of data to compare (NLA_U32: nft_registers)
+ * @NFTA_CMP_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_CMP_DATA: data to compare against (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_cmp_attributes {
+	NFTA_CMP_UNSPEC,
+	NFTA_CMP_SREG,
+	NFTA_CMP_OP,
+	NFTA_CMP_DATA,
+	__NFTA_CMP_MAX
+};
+#define NFTA_CMP_MAX		(__NFTA_CMP_MAX - 1)
+
+enum nft_set_elem_flags {
+	NFT_SE_INTERVAL_END	= 0x1,
+};
+
+enum nft_set_elem_attributes {
+	NFTA_SE_UNSPEC,
+	NFTA_SE_KEY,
+	NFTA_SE_DATA,
+	NFTA_SE_FLAGS,
+	__NFTA_SE_MAX
+};
+#define NFTA_SE_MAX		(__NFTA_SE_MAX - 1)
+
+enum nft_set_flags {
+	NFT_SET_INTERVAL	= 0x1,
+	NFT_SET_MAP		= 0x2,
+};
+
+enum nft_set_attributes {
+	NFTA_SET_UNSPEC,
+	NFTA_SET_FLAGS,
+	NFTA_SET_SREG,
+	NFTA_SET_DREG,
+	NFTA_SET_KLEN,
+	NFTA_SET_DLEN,
+	NFTA_SET_ELEMENTS,
+	__NFTA_SET_MAX
+};
+#define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
+
+enum nft_hash_flags {
+	NFT_HASH_MAP		= 0x1,
+};
+
+enum nft_hash_elem_attributes {
+	NFTA_HE_UNSPEC,
+	NFTA_HE_KEY,
+	NFTA_HE_DATA,
+	__NFTA_HE_MAX
+};
+#define NFTA_HE_MAX		(__NFTA_HE_MAX - 1)
+
+enum nft_hash_attributes {
+	NFTA_HASH_UNSPEC,
+	NFTA_HASH_FLAGS,
+	NFTA_HASH_SREG,
+	NFTA_HASH_DREG,
+	NFTA_HASH_KLEN,
+	NFTA_HASH_ELEMENTS,
+	__NFTA_HASH_MAX
+};
+#define NFTA_HASH_MAX		(__NFTA_HASH_MAX - 1)
+
+/**
+ * enum nft_payload_bases - nf_tables payload expression offset bases
+ *
+ * @NFT_PAYLOAD_LL_HEADER: link layer header
+ * @NFT_PAYLOAD_NETWORK_HEADER: network header
+ * @NFT_PAYLOAD_TRANSPORT_HEADER: transport header
+ */
+enum nft_payload_bases {
+	NFT_PAYLOAD_LL_HEADER,
+	NFT_PAYLOAD_NETWORK_HEADER,
+	NFT_PAYLOAD_TRANSPORT_HEADER,
+};
+
+/**
+ * enum nft_payload_attributes - nf_tables payload expression netlink attributes
+ *
+ * @NFTA_PAYLOAD_DREG: destination register to load data into (NLA_U32: nft_registers)
+ * @NFTA_PAYLOAD_BASE: payload base (NLA_U32: nft_payload_bases)
+ * @NFTA_PAYLOAD_OFFSET: payload offset relative to base (NLA_U32)
+ * @NFTA_PAYLOAD_LEN: payload length (NLA_U32)
+ */
+enum nft_payload_attributes {
+	NFTA_PAYLOAD_UNSPEC,
+	NFTA_PAYLOAD_DREG,
+	NFTA_PAYLOAD_BASE,
+	NFTA_PAYLOAD_OFFSET,
+	NFTA_PAYLOAD_LEN,
+	__NFTA_PAYLOAD_MAX
+};
+#define NFTA_PAYLOAD_MAX	(__NFTA_PAYLOAD_MAX - 1)
+
+/**
+ * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes
+ *
+ * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8)
+ * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32)
+ * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
+ */
+enum nft_exthdr_attributes {
+	NFTA_EXTHDR_UNSPEC,
+	NFTA_EXTHDR_DREG,
+	NFTA_EXTHDR_TYPE,
+	NFTA_EXTHDR_OFFSET,
+	NFTA_EXTHDR_LEN,
+	__NFTA_EXTHDR_MAX
+};
+#define NFTA_EXTHDR_MAX		(__NFTA_EXTHDR_MAX - 1)
+
+/**
+ * enum nft_meta_keys - nf_tables meta expression keys
+ *
+ * @NFT_META_LEN: packet length (skb->len)
+ * @NFT_META_PROTOCOL: packet ethertype protocol (skb->protocol), invalid in OUTPUT
+ * @NFT_META_PRIORITY: packet priority (skb->priority)
+ * @NFT_META_MARK: packet mark (skb->mark)
+ * @NFT_META_IIF: packet input interface index (dev->ifindex)
+ * @NFT_META_OIF: packet output interface index (dev->ifindex)
+ * @NFT_META_IIFNAME: packet input interface name (dev->name)
+ * @NFT_META_OIFNAME: packet output interface name (dev->name)
+ * @NFT_META_IIFTYPE: packet input interface type (dev->type)
+ * @NFT_META_OIFTYPE: packet output interface type (dev->type)
+ * @NFT_META_SKUID: originating socket UID (fsuid)
+ * @NFT_META_SKGID: originating socket GID (fsgid)
+ * @NFT_META_NFTRACE: packet nftrace bit
+ * @NFT_META_RTCLASSID: realm value of packet's route (skb->dst->tclassid)
+ * @NFT_META_SECMARK: packet secmark (skb->secmark)
+ */
+enum nft_meta_keys {
+	NFT_META_LEN,
+	NFT_META_PROTOCOL,
+	NFT_META_PRIORITY,
+	NFT_META_MARK,
+	NFT_META_IIF,
+	NFT_META_OIF,
+	NFT_META_IIFNAME,
+	NFT_META_OIFNAME,
+	NFT_META_IIFTYPE,
+	NFT_META_OIFTYPE,
+	NFT_META_SKUID,
+	NFT_META_SKGID,
+	NFT_META_NFTRACE,
+	NFT_META_RTCLASSID,
+	NFT_META_SECMARK,
+};
+
+/**
+ * enum nft_meta_attributes - nf_tables meta expression netlink attributes
+ *
+ * @NFTA_META_DREG: destination register (NLA_U32)
+ * @NFTA_META_KEY: meta data item to load (NLA_U32: nft_meta_keys)
+ */
+enum nft_meta_attributes {
+	NFTA_META_UNSPEC,
+	NFTA_META_DREG,
+	NFTA_META_KEY,
+	__NFTA_META_MAX
+};
+#define NFTA_META_MAX		(__NFTA_META_MAX - 1)
+
+/**
+ * enum nft_ct_keys - nf_tables ct expression keys
+ *
+ * @NFT_CT_STATE: conntrack state (bitmask of enum ip_conntrack_info)
+ * @NFT_CT_DIRECTION: conntrack direction (enum ip_conntrack_dir)
+ * @NFT_CT_STATUS: conntrack status (bitmask of enum ip_conntrack_status)
+ * @NFT_CT_MARK: conntrack mark value
+ * @NFT_CT_SECMARK: conntrack secmark value
+ * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms
+ * @NFT_CT_HELPER: connection tracking helper assigned to conntrack
+ * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol
+ * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address)
+ * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address)
+ * @NFT_CT_PROTOCOL: conntrack layer 4 protocol
+ * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source
+ * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination
+ */
+enum nft_ct_keys {
+	NFT_CT_STATE,
+	NFT_CT_DIRECTION,
+	NFT_CT_STATUS,
+	NFT_CT_MARK,
+	NFT_CT_SECMARK,
+	NFT_CT_EXPIRATION,
+	NFT_CT_HELPER,
+	NFT_CT_L3PROTOCOL,
+	NFT_CT_SRC,
+	NFT_CT_DST,
+	NFT_CT_PROTOCOL,
+	NFT_CT_PROTO_SRC,
+	NFT_CT_PROTO_DST,
+};
+
+/**
+ * enum nft_ct_attributes - nf_tables ct expression netlink attributes
+ *
+ * @NFTA_CT_DREG: destination register (NLA_U32)
+ * @NFTA_CT_KEY: conntrack data item to load (NLA_U32: nft_ct_keys)
+ * @NFTA_CT_DIRECTION: direction in case of directional keys (NLA_U8)
+ */
+enum nft_ct_attributes {
+	NFTA_CT_UNSPEC,
+	NFTA_CT_DREG,
+	NFTA_CT_KEY,
+	NFTA_CT_DIRECTION,
+	__NFTA_CT_MAX
+};
+#define NFTA_CT_MAX		(__NFTA_CT_MAX - 1)
+
+/**
+ * enum nft_limit_attributes - nf_tables limit expression netlink attributes
+ *
+ * @NFTA_LIMIT_RATE: refill rate (NLA_U64)
+ * @NFTA_LIMIT_UNIT: refill unit (NLA_U64)
+ */
+enum nft_limit_attributes {
+	NFTA_LIMIT_UNSPEC,
+	NFTA_LIMIT_RATE,
+	NFTA_LIMIT_UNIT,
+	__NFTA_LIMIT_MAX
+};
+#define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
+
+/**
+ * enum nft_counter_attributes - nf_tables counter expression netlink attributes
+ *
+ * @NFTA_COUNTER_BYTES: number of bytes (NLA_U64)
+ * @NFTA_COUNTER_PACKETS: number of packets (NLA_U64)
+ */
+enum nft_counter_attributes {
+	NFTA_COUNTER_UNSPEC,
+	NFTA_COUNTER_BYTES,
+	NFTA_COUNTER_PACKETS,
+	__NFTA_COUNTER_MAX
+};
+#define NFTA_COUNTER_MAX	(__NFTA_COUNTER_MAX - 1)
+
+/**
+ * enum nft_log_attributes - nf_tables log expression netlink attributes
+ *
+ * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U32)
+ * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING)
+ * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32)
+ * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32)
+ */
+enum nft_log_attributes {
+	NFTA_LOG_UNSPEC,
+	NFTA_LOG_GROUP,
+	NFTA_LOG_PREFIX,
+	NFTA_LOG_SNAPLEN,
+	NFTA_LOG_QTHRESHOLD,
+	__NFTA_LOG_MAX
+};
+#define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
+
+/**
+ * enum nft_reject_types - nf_tables reject expression reject types
+ *
+ * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable
+ * @NFT_REJECT_TCP_RST: reject using TCP RST
+ */
+enum nft_reject_types {
+	NFT_REJECT_ICMP_UNREACH,
+	NFT_REJECT_TCP_RST,
+};
+
+/**
+ * enum nft_reject_attributes - nf_tables reject expression netlink attributes
+ *
+ * @NFTA_REJECT_TYPE: packet type to use (NLA_U32: nft_reject_types)
+ * @NFTA_REJECT_ICMP_CODE: ICMP code to use (NLA_U8)
+ */
+enum nft_reject_attributes {
+	NFTA_REJECT_UNSPEC,
+	NFTA_REJECT_TYPE,
+	NFTA_REJECT_ICMP_CODE,
+	__NFTA_REJECT_MAX
+};
+#define NFTA_REJECT_MAX		(__NFTA_REJECT_MAX - 1)
+
+/**
+ * enum nft_nat_types - nf_tables nat expression NAT types
+ *
+ * @NFT_NAT_SNAT: source NAT
+ * @NFT_NAT_DNAT: destination NAT
+ */
+enum nft_nat_types {
+	NFT_NAT_SNAT,
+	NFT_NAT_DNAT,
+};
+
+/**
+ * enum nft_nat_attributes - nf_tables nat expression netlink attributes
+ *
+ * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types)
+ * @NFTA_NAT_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
+ */
+enum nft_nat_attributes {
+	NFTA_NAT_UNSPEC,
+	NFTA_NAT_TYPE,
+	NFTA_NAT_ADDR_MIN,
+	NFTA_NAT_ADDR_MAX,
+	NFTA_NAT_PROTO_MIN,
+	NFTA_NAT_PROTO_MAX,
+	__NFTA_NAT_MAX
+};
+#define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
+
+#endif /* _LINUX_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 4a4efafad5f4..d276c3bd55b8 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -18,6 +18,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_CONNTRACK_EXP_UPDATE	NFNLGRP_CONNTRACK_EXP_UPDATE
 	NFNLGRP_CONNTRACK_EXP_DESTROY,
 #define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
+	NFNLGRP_NFTABLES,
+#define NFNLGRP_NFTABLES                NFNLGRP_NFTABLES
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
@@ -51,6 +53,7 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_ACCT		7
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
 #define NFNL_SUBSYS_CTHELPER		9
-#define NFNL_SUBSYS_COUNT		10
+#define NFNL_SUBSYS_NFTABLES		10
+#define NFNL_SUBSYS_COUNT		11
 
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index a9aff9c7d027..68f8128147be 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -1,6 +1,9 @@
 #
 # Bridge netfilter configuration
 #
+#
+config NF_TABLES_BRIDGE
+	tristate "Ethernet Bridge nf_tables support"
 
 menuconfig BRIDGE_NF_EBTABLES
 	tristate "Ethernet Bridge tables (ebtables) support"
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 0718699540b0..ea7629f58b3d 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -2,6 +2,8 @@
 # Makefile for the netfilter modules for Link Layer filtering on a bridge.
 #
 
+obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o
+
 obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
 
 # tables
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
new file mode 100644
index 000000000000..bc5c21c911c0
--- /dev/null
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_tables.h>
+
+static struct nft_af_info nft_af_bridge __read_mostly = {
+	.family		= NFPROTO_BRIDGE,
+	.nhooks		= NF_BR_NUMHOOKS,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nf_tables_bridge_init(void)
+{
+	return nft_register_afinfo(&nft_af_bridge);
+}
+
+static void __exit nf_tables_bridge_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_bridge);
+}
+
+module_init(nf_tables_bridge_init);
+module_exit(nf_tables_bridge_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1657e39b291f..eb1d56ece361 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,22 @@ config NF_CONNTRACK_PROC_COMPAT
 
 	  If unsure, say Y.
 
+config NF_TABLES_IPV4
+	depends on NF_TABLES
+	tristate "IPv4 nf_tables support"
+
+config NFT_REJECT_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "nf_tables IPv4 reject support"
+
+config NF_TABLE_ROUTE_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "IPv4 nf_tables route table support"
+
+config NF_TABLE_NAT_IPV4
+	depends on NF_TABLES_IPV4
+	tristate "IPv4 nf_tables nat table support"
+
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3622b248b6dd..b2f01cd2cd65 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -27,6 +27,11 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NF_TABLE_ROUTE_IPV4) += nf_table_route_ipv4.o
+obj-$(CONFIG_NF_TABLE_NAT_IPV4) += nf_table_nat_ipv4.o
+
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nf_table_nat_ipv4.c
new file mode 100644
index 000000000000..2a6f184c10bd
--- /dev/null
+++ b/net/ipv4/netfilter/nf_table_nat_ipv4.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers	sreg_addr_min:8;
+	enum nft_registers	sreg_addr_max:8;
+	enum nft_registers	sreg_proto_min:8;
+	enum nft_registers	sreg_proto_max:8;
+	enum nf_nat_manip_type	type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_nat_ops __read_mostly = {
+	.name		= "nat",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_nat_eval,
+	.init		= nft_nat_init,
+	.dump		= nft_nat_dump,
+	.policy		= nft_nat_policy,
+	.maxattr	= NFTA_NAT_MAX,
+};
+
+/*
+ * NAT table
+ */
+
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		ret = nft_do_chain(ops, skb, in, out, okfn);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	__be32 daddr = ip_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ip_hdr(skb)->daddr != daddr) {
+		skb_dst_drop(skb);
+	}
+	return ret;
+}
+
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.u3.ip !=
+		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
+		    ct->tuplehash[dir].tuple.src.u.all !=
+		    ct->tuplehash[!dir].tuple.dst.u.all)
+			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
+								ret : NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_nat_prerouting __read_mostly = {
+	.chain	= {
+		.name		= "PREROUTING",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_prerouting.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_prerouting,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+		.priv		= &nf_chain_nat_prerouting.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_postrouting __read_mostly = {
+	.chain	= {
+		.name		= "POSTROUTING",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_postrouting.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_postrouting,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+		.priv		= &nf_chain_nat_postrouting.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_output,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+		.priv		= &nf_chain_nat_output.chain,
+	},
+};
+
+static struct nft_base_chain nf_chain_nat_input __read_mostly = {
+	.chain	= {
+		.name		= "INPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_nat_input.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_nat_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+		.priv		= &nf_chain_nat_input.chain,
+	},
+};
+
+
+static struct nft_table nf_table_nat_ipv4 __read_mostly = {
+	.name	= "nat",
+	.chains	= LIST_HEAD_INIT(nf_table_nat_ipv4.chains),
+};
+
+static int __init nf_table_nat_init(void)
+{
+	int err;
+
+	list_add_tail(&nf_chain_nat_prerouting.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_postrouting.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_output.chain.list,
+		      &nf_table_nat_ipv4.chains);
+	list_add_tail(&nf_chain_nat_input.chain.list,
+		      &nf_table_nat_ipv4.chains);
+
+	err = nft_register_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
+	if (err < 0)
+		goto err1;
+
+	err = nft_register_expr(&nft_nat_ops);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nft_unregister_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
+err1:
+	return err;
+}
+
+static void __exit nf_table_nat_exit(void)
+{
+	nft_unregister_expr(&nft_nat_ops);
+	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
+}
+
+module_init(nf_table_nat_init);
+module_exit(nf_table_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET, "nat");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nf_table_route_ipv4.c b/net/ipv4/netfilter/nf_table_route_ipv4.c
new file mode 100644
index 000000000000..4f257a1ed661
--- /dev/null
+++ b/net/ipv4/netfilter/nf_table_route_ipv4.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	u32 mark;
+	__be32 saddr, daddr;
+	u_int8_t tos;
+	const struct iphdr *iph;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	mark = skb->mark;
+	iph = ip_hdr(skb);
+	saddr = iph->saddr;
+	daddr = iph->daddr;
+	tos = iph->tos;
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE) {
+		iph = ip_hdr(skb);
+
+		if (iph->saddr != saddr ||
+		    iph->daddr != daddr ||
+		    skb->mark != mark ||
+		    iph->tos != tos)
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+	}
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_route_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_route_table_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_MANGLE,
+		.priv		= &nf_chain_route_output.chain,
+	},
+};
+
+static struct nft_table nf_table_route_ipv4 __read_mostly = {
+	.name	= "route",
+	.chains	= LIST_HEAD_INIT(nf_table_route_ipv4.chains),
+};
+
+static int __init nf_table_route_init(void)
+{
+	list_add_tail(&nf_chain_route_output.chain.list,
+		      &nf_table_route_ipv4.chains);
+	return nft_register_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+}
+
+static void __exit nf_table_route_exit(void)
+{
+	nft_unregister_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+}
+
+module_init(nf_table_route_init);
+module_exit(nf_table_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 000000000000..63d0a3bf53d3
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/ip.h>
+
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+				    struct sk_buff *skb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    int (*okfn)(struct sk_buff *))
+{
+	if (unlikely(skb->len < sizeof(struct iphdr) ||
+		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+		if (net_ratelimit())
+			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
+				"packet\n");
+		return NF_ACCEPT;
+	}
+
+	return nft_do_chain(ops, skb, in, out, okfn);
+}
+
+static struct nft_af_info nft_af_ipv4 __read_mostly = {
+	.family		= NFPROTO_IPV4,
+	.nhooks		= NF_INET_NUMHOOKS,
+	.owner		= THIS_MODULE,
+	.hooks		= {
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+	},
+};
+
+static int __init nf_tables_ipv4_init(void)
+{
+	return nft_register_afinfo(&nft_af_ipv4);
+}
+
+static void __exit nf_tables_ipv4_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 000000000000..b4ee8d3bb1e4
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/icmp.h>
+
+struct nft_reject {
+	enum nft_reject_types	type:8;
+	u8			icmp_code;
+};
+
+static void nft_reject_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_reject *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0);
+		break;
+	case NFT_REJECT_TCP_RST:
+		break;
+	}
+
+	data[NFT_REG_VERDICT].verdict = NF_DROP;
+}
+
+static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
+	[NFTA_REJECT_TYPE]		= { .type = NLA_U32 },
+	[NFTA_REJECT_ICMP_CODE]		= { .type = NLA_U8 },
+};
+
+static int nft_reject_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_reject *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_REJECT_TYPE] == NULL)
+		return -EINVAL;
+
+	priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+			return -EINVAL;
+		priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+	case NFT_REJECT_TCP_RST:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_reject *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type))
+		goto nla_put_failure;
+
+	switch (priv->type) {
+	case NFT_REJECT_ICMP_UNREACH:
+		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+			goto nla_put_failure;
+		break;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops reject_ops __read_mostly = {
+	.name		= "reject",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_reject_eval,
+	.init		= nft_reject_init,
+	.dump		= nft_reject_dump,
+	.policy		= nft_reject_policy,
+	.maxattr	= NFTA_REJECT_MAX,
+};
+
+static int __init nft_reject_module_init(void)
+{
+	return nft_register_expr(&reject_ops);
+}
+
+static void __exit nft_reject_module_exit(void)
+{
+	nft_unregister_expr(&reject_ops);
+}
+
+module_init(nft_reject_module_init);
+module_exit(nft_reject_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("reject");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index a7f842b29b67..5677e38eeca3 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,14 @@ config NF_CONNTRACK_IPV6
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_TABLES_IPV6
+	depends on NF_TABLES
+	tristate "IPv6 nf_tables support"
+
+config NF_TABLE_ROUTE_IPV6
+	depends on NF_TABLES_IPV6
+	tristate "IPv6 nf_tables route table support"
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2b53738f798c..956af4492d10 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -23,6 +23,10 @@ obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
 nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
 obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
+# nf_tables
+obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
+obj-$(CONFIG_NF_TABLE_ROUTE_IPV6) += nf_table_route_ipv6.o
+
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
diff --git a/net/ipv6/netfilter/nf_table_route_ipv6.c b/net/ipv6/netfilter/nf_table_route_ipv6.c
new file mode 100644
index 000000000000..48ac65c7b398
--- /dev/null
+++ b/net/ipv6/netfilter/nf_table_route_ipv6.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	struct in6_addr saddr, daddr;
+	u_int8_t hop_limit;
+	u32 mark, flowlabel;
+
+	/* save source/dest address, mark, hoplimit, flowlabel, priority */
+	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+	mark = skb->mark;
+	hop_limit = ipv6_hdr(skb)->hop_limit;
+
+	/* flowlabel and prio (includes version, which shouldn't change either */
+	flowlabel = *((u32 *)ipv6_hdr(skb));
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE &&
+	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+	     skb->mark != mark ||
+	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
+		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+
+	return ret;
+}
+
+static struct nft_base_chain nf_chain_route_output __read_mostly = {
+	.chain	= {
+		.name		= "OUTPUT",
+		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
+		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+	},
+	.ops	= {
+		.hook		= nf_route_table_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_MANGLE,
+		.priv		= &nf_chain_route_output.chain,
+	},
+};
+
+static struct nft_table nf_table_route_ipv6 __read_mostly = {
+	.name	= "route",
+	.chains	= LIST_HEAD_INIT(nf_table_route_ipv6.chains),
+};
+
+static int __init nf_table_route_init(void)
+{
+	list_add_tail(&nf_chain_route_output.chain.list,
+		      &nf_table_route_ipv6.chains);
+	return nft_register_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+}
+
+static void __exit nf_table_route_exit(void)
+{
+	nft_unregister_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+}
+
+module_init(nf_table_route_init);
+module_exit(nf_table_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_TABLE(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
new file mode 100644
index 000000000000..e0717cea4913
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
+				    struct sk_buff *skb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    int (*okfn)(struct sk_buff *))
+{
+	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
+		if (net_ratelimit())
+			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
+				"packet\n");
+		return NF_ACCEPT;
+	}
+
+	return nft_do_chain(ops, skb, in, out, okfn);
+}
+
+static struct nft_af_info nft_af_ipv6 __read_mostly = {
+	.family		= NFPROTO_IPV6,
+	.nhooks		= NF_INET_NUMHOOKS,
+	.owner		= THIS_MODULE,
+	.hooks		= {
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+	},
+};
+
+static int __init nf_tables_ipv6_init(void)
+{
+	return nft_register_afinfo(&nft_af_ipv6);
+}
+
+static void __exit nf_tables_ipv6_exit(void)
+{
+	nft_unregister_afinfo(&nft_af_ipv6);
+}
+
+module_init(nf_tables_ipv6_init);
+module_exit(nf_tables_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET6);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 6e839b6dff2b..c271e1af93b5 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -413,6 +413,43 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_TABLES
+	depends on NETFILTER_NETLINK
+	tristate "Netfilter nf_tables support"
+
+config NFT_EXTHDR
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables IPv6 exthdr module"
+
+config NFT_META
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables meta module"
+
+config NFT_CT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables conntrack module"
+
+config NFT_SET
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables set module"
+
+config NFT_HASH
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables hash module"
+
+config NFT_COUNTER
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables counter module"
+
+config NFT_LOG
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables log module"
+
+config NFT_LIMIT
+	depends on NF_TABLES
+	tristate "Netfilter nf_tables limit module"
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c3a0a12907f6..1ca3f3932826 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -64,6 +64,22 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 # SYNPROXY
 obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
 
+# nf_tables
+nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
+nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+
+obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
+obj-$(CONFIG_NFT_META)		+= nft_meta.o
+obj-$(CONFIG_NFT_CT)		+= nft_ct.o
+obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
+#nf_tables-objs			+= nft_meta_target.o
+obj-$(CONFIG_NFT_SET)		+= nft_set.o
+obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
+obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
+obj-$(CONFIG_NFT_LOG)		+= nft_log.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 000000000000..7d59c89c6c75
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,1760 @@
+/*
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_afinfo);
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ *	nft_register_afinfo - register nf_tables address family info
+ *
+ *	@afi: address family info to register
+ *
+ *	Register the address family for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct nft_af_info *afi)
+{
+	INIT_LIST_HEAD(&afi->tables);
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail(&afi->list, &nf_tables_afinfo);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
+
+/**
+ *	nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ *	@afi: address family info to unregister
+ *
+ *	Unregister the address family for use with nf_tables.
+ */
+void nft_unregister_afinfo(struct nft_af_info *afi)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&afi->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
+
+static struct nft_af_info *nft_afinfo_lookup(int family)
+{
+	struct nft_af_info *afi;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (afi->family == family)
+			return afi;
+	}
+	return NULL;
+}
+
+static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
+{
+	struct nft_af_info *afi;
+
+	afi = nft_afinfo_lookup(family);
+	if (afi != NULL)
+		return afi;
+#ifdef CONFIG_MODULES
+	if (autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-afinfo-%u", family);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		afi = nft_afinfo_lookup(family);
+		if (afi != NULL)
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-EAFNOSUPPORT);
+}
+
+/*
+ * Tables
+ */
+
+static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+					  const struct nlattr *nla)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &afi->tables, list) {
+		if (!nla_strcmp(nla, table->name))
+			return table;
+	}
+	return NULL;
+}
+
+static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+						const struct nlattr *nla,
+						bool autoload)
+{
+	struct nft_table *table;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	table = nft_table_lookup(afi, nla);
+	if (table != NULL)
+		return table;
+
+#ifdef CONFIG_MODULES
+	if (autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-table-%u-%*.s", afi->family,
+			       nla_len(nla)-1, (const char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (nft_table_lookup(afi, nla))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+static inline u64 nf_tables_alloc_handle(struct nft_table *table)
+{
+	return ++table->hgenerator;
+}
+
+static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
+	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
+};
+
+static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
+				     int event, u32 flags, int family,
+				     const struct nft_table *table)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_table_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  int event, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	bool report;
+	int err;
+
+	report = nlh ? nlmsg_report(nlh) : false;
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_table_info(skb, portid, seq, event, 0,
+					family, table);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_tables(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				memset(&cb->args[1], 0,
+				       sizeof(cb->args) - sizeof(cb->args[0]));
+			if (nf_tables_fill_table_info(skb,
+						      NETLINK_CB(cb->skb).portid,
+						      cb->nlh->nlmsg_seq,
+						      NFT_MSG_NEWTABLE,
+						      NLM_F_MULTI,
+						      afi->family, table) < 0)
+				goto done;
+cont:
+			idx++;
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_tables,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid,
+					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
+					family, table);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nlattr *name;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	name = nla[NFTA_TABLE_NAME];
+	table = nf_tables_table_lookup(afi, name, false);
+	if (IS_ERR(table)) {
+		if (PTR_ERR(table) != -ENOENT)
+			return PTR_ERR(table);
+		table = NULL;
+	}
+
+	if (table != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+		return 0;
+	}
+
+	table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL);
+	if (table == NULL)
+		return -ENOMEM;
+
+	nla_strlcpy(table->name, name, nla_len(name));
+	INIT_LIST_HEAD(&table->chains);
+
+	list_add_tail(&table->list, &afi->tables);
+	nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family);
+	return 0;
+}
+
+static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	if (table->flags & NFT_TABLE_BUILTIN)
+		return -EOPNOTSUPP;
+
+	if (table->use)
+		return -EBUSY;
+
+	list_del(&table->list);
+	nf_tables_table_notify(skb, nlh, table, NFT_MSG_DELTABLE, family);
+	kfree(table);
+	return 0;
+}
+
+static struct nft_table *__nf_tables_table_lookup(const struct nft_af_info *afi,
+						  const char *name)
+{
+	struct nft_table *table;
+
+	list_for_each_entry(table, &afi->tables, list) {
+		if (!strcmp(name, table->name))
+			return table;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_chain_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  const struct nft_chain *chain,
+				  int event, int family);
+
+/**
+ *	nft_register_table - register a built-in table
+ *
+ *	@table: the table to register
+ *	@family: protocol family to register table with
+ *
+ *	Register a built-in table for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_table(struct nft_table *table, int family)
+{
+	struct nft_af_info *afi;
+	struct nft_table *t;
+	struct nft_chain *chain;
+	int err;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+again:
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi)) {
+		err = PTR_ERR(afi);
+		if (err == -EAGAIN)
+			goto again;
+		goto err;
+	}
+
+	t = __nf_tables_table_lookup(afi, table->name);
+	if (IS_ERR(t)) {
+		err = PTR_ERR(t);
+		if (err != -ENOENT)
+			goto err;
+		t = NULL;
+	}
+
+	if (t != NULL) {
+		err = -EEXIST;
+		goto err;
+	}
+
+	table->flags |= NFT_TABLE_BUILTIN;
+	list_add_tail(&table->list, &afi->tables);
+	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
+	list_for_each_entry(chain, &table->chains, list)
+		nf_tables_chain_notify(NULL, NULL, table, chain,
+				       NFT_MSG_NEWCHAIN, family);
+	err = 0;
+err:
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_register_table);
+
+/**
+ *	nft_unregister_table - unregister a built-in table
+ *
+ *	@table: the table to unregister
+ *	@family: protocol family to unregister table with
+ *
+ *	Unregister a built-in table for use with nf_tables.
+ */
+void nft_unregister_table(struct nft_table *table, int family)
+{
+	struct nft_chain *chain;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&table->list);
+	list_for_each_entry(chain, &table->chains, list)
+		nf_tables_chain_notify(NULL, NULL, table, chain,
+				       NFT_MSG_DELCHAIN, family);
+	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_DELTABLE, family);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_table);
+
+/*
+ * Chains
+ */
+
+static struct nft_chain *
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+{
+	struct nft_chain *chain;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (chain->handle == handle)
+			return chain;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
+						const struct nlattr *nla)
+{
+	struct nft_chain *chain;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nla_strcmp(nla, chain->name))
+			return chain;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
+	[NFTA_CHAIN_TABLE]	= { .type = NLA_STRING },
+	[NFTA_CHAIN_HANDLE]	= { .type = NLA_U64 },
+	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
+	[NFTA_HOOK_HOOKNUM]	= { .type = NLA_U32 },
+	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 },
+};
+
+static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
+				     int event, u32 flags, int family,
+				     const struct nft_table *table,
+				     const struct nft_chain *chain)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
+		goto nla_put_failure;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		const struct nf_hook_ops *ops = &nft_base_chain(chain)->ops;
+		struct nlattr *nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		if (nest == NULL)
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+			goto nla_put_failure;
+		nla_nest_end(skb, nest);
+	}
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_chain_notify(const struct sk_buff *oskb,
+				  const struct nlmsghdr *nlh,
+				  const struct nft_table *table,
+				  const struct nft_chain *chain,
+				  int event, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	bool report;
+	int err;
+
+	report = nlh ? nlmsg_report(nlh) : false;
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_chain_info(skb, portid, seq, event, 0, family,
+					table, chain);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_chains(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			list_for_each_entry(chain, &table->chains, list) {
+				if (idx < s_idx)
+					goto cont;
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid,
+							      cb->nlh->nlmsg_seq,
+							      NFT_MSG_NEWCHAIN,
+							      NLM_F_MULTI,
+							      afi->family, table, chain) < 0)
+					goto done;
+cont:
+				idx++;
+			}
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+
+static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_chains,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid,
+					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
+					family, table, chain);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nlattr * uninitialized_var(name);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_base_chain *basechain;
+	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	int family = nfmsg->nfgen_family;
+	u64 handle = 0;
+	int err;
+	bool create;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], create);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	if (table->use == UINT_MAX)
+		return -EOVERFLOW;
+
+	chain = NULL;
+	name = nla[NFTA_CHAIN_NAME];
+
+	if (nla[NFTA_CHAIN_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		chain = nf_tables_chain_lookup_byhandle(table, handle);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+	} else {
+		chain = nf_tables_chain_lookup(table, name);
+		if (IS_ERR(chain)) {
+			if (PTR_ERR(chain) != -ENOENT)
+				return PTR_ERR(chain);
+			chain = NULL;
+		}
+	}
+
+	if (chain != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+
+		if (nla[NFTA_CHAIN_HANDLE] && name &&
+		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
+			return -EEXIST;
+
+		if (nla[NFTA_CHAIN_HANDLE] && name)
+			nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+		goto notify;
+	}
+
+	if (nla[NFTA_CHAIN_HOOK]) {
+		struct nf_hook_ops *ops;
+
+		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+				       nft_hook_policy);
+		if (err < 0)
+			return err;
+		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+		    ha[NFTA_HOOK_PRIORITY] == NULL)
+			return -EINVAL;
+		if (ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])) >= afi->nhooks)
+			return -EINVAL;
+
+		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+		if (basechain == NULL)
+			return -ENOMEM;
+		chain = &basechain->chain;
+
+		ops = &basechain->ops;
+		ops->pf		= family;
+		ops->owner	= afi->owner;
+		ops->hooknum	= ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		ops->priority	= ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+		ops->priv	= chain;
+		ops->hook	= nft_do_chain;
+		if (afi->hooks[ops->hooknum])
+			ops->hook = afi->hooks[ops->hooknum];
+
+		chain->flags |= NFT_BASE_CHAIN;
+	} else {
+		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+		if (chain == NULL)
+			return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&chain->rules);
+	chain->handle = nf_tables_alloc_handle(table);
+	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+	list_add_tail(&chain->list, &table->chains);
+	table->use++;
+notify:
+	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
+			       family);
+	return 0;
+}
+
+static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
+{
+	struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head);
+
+	BUG_ON(chain->use > 0);
+
+	if (chain->flags & NFT_BASE_CHAIN)
+		kfree(nft_base_chain(chain));
+	else
+		kfree(chain);
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (chain->flags & NFT_CHAIN_BUILTIN)
+		return -EOPNOTSUPP;
+
+	if (!list_empty(&chain->rules))
+		return -EBUSY;
+
+	list_del(&chain->list);
+	table->use--;
+
+	if (chain->flags & NFT_BASE_CHAIN)
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+
+	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN,
+			       family);
+
+	/* Make sure all rule references are gone before this is released */
+	call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy);
+	return 0;
+}
+
+static void nft_ctx_init(struct nft_ctx *ctx,
+			 const struct nft_af_info *afi,
+			 const struct nft_table *table,
+			 const struct nft_chain *chain)
+{
+	ctx->afi   = afi;
+	ctx->table = table;
+	ctx->chain = chain;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ *	nft_register_expr - register nf_tables expr operations
+ *	@ops: expr operations
+ *
+ *	Registers the expr operations for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_tail(&ops->list, &nf_tables_expressions);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ *	nft_unregister_expr - unregister nf_tables expr operations
+ *	@ops: expr operations
+ *
+ * 	Unregisters the expr operations for use with nf_tables.
+ */
+void nft_unregister_expr(struct nft_expr_ops *ops)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del(&ops->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+static const struct nft_expr_ops *__nft_expr_ops_get(struct nlattr *nla)
+{
+	const struct nft_expr_ops *ops;
+
+	list_for_each_entry(ops, &nf_tables_expressions, list) {
+		if (!nla_strcmp(nla, ops->name))
+			return ops;
+	}
+	return NULL;
+}
+
+static const struct nft_expr_ops *nft_expr_ops_get(struct nlattr *nla)
+{
+	const struct nft_expr_ops *ops;
+
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	ops = __nft_expr_ops_get(nla);
+	if (ops != NULL && try_module_get(ops->owner))
+		return ops;
+
+#ifdef CONFIG_MODULES
+	if (ops == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-expr-%.*s",
+			       nla_len(nla), (char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_expr_ops_get(nla))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+	[NFTA_EXPR_NAME]	= { .type = NLA_STRING },
+	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_expr_info(struct sk_buff *skb,
+				    const struct nft_expr *expr)
+{
+	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->name))
+		goto nla_put_failure;
+
+	if (expr->ops->dump) {
+		struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+		if (data == NULL)
+			goto nla_put_failure;
+		if (expr->ops->dump(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, data);
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+};
+
+struct nft_expr_info {
+	const struct nft_expr_ops	*ops;
+	struct nlattr			*tb[NFTA_EXPR_MAX + 1];
+};
+
+static int nf_tables_expr_parse(const struct nlattr *nla,
+				struct nft_expr_info *info)
+{
+	const struct nft_expr_ops *ops;
+	int err;
+
+	err = nla_parse_nested(info->tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+	if (err < 0)
+		return err;
+
+	ops = nft_expr_ops_get(info->tb[NFTA_EXPR_NAME]);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	info->ops = ops;
+	return 0;
+}
+
+static int nf_tables_newexpr(const struct nft_ctx *ctx,
+			     struct nft_expr_info *info,
+			     struct nft_expr *expr)
+{
+	const struct nft_expr_ops *ops = info->ops;
+	int err;
+
+	expr->ops = ops;
+	if (ops->init) {
+		struct nlattr *ma[ops->maxattr + 1];
+
+		if (info->tb[NFTA_EXPR_DATA]) {
+			err = nla_parse_nested(ma, ops->maxattr,
+					       info->tb[NFTA_EXPR_DATA],
+					       ops->policy);
+			if (err < 0)
+				goto err1;
+		} else
+			memset(ma, 0, sizeof(ma[0]) * (ops->maxattr + 1));
+
+		err = ops->init(ctx, expr, (const struct nlattr **)ma);
+		if (err < 0)
+			goto err1;
+	}
+
+	info->ops = NULL;
+	return 0;
+
+err1:
+	expr->ops = NULL;
+	return err;
+}
+
+static void nf_tables_expr_destroy(struct nft_expr *expr)
+{
+	if (expr->ops->destroy)
+		expr->ops->destroy(expr);
+	module_put(expr->ops->owner);
+}
+
+/*
+ * Rules
+ */
+
+static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
+						u64 handle)
+{
+	struct nft_rule *rule;
+
+	// FIXME: this sucks
+	list_for_each_entry(rule, &chain->rules, list) {
+		if (handle == rule->handle)
+			return rule;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
+					      const struct nlattr *nla)
+{
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
+	return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+}
+
+static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
+	[NFTA_RULE_TABLE]	= { .type = NLA_STRING },
+	[NFTA_RULE_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
+	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
+				    int event, u32 flags, int family,
+				    const struct nft_table *table,
+				    const struct nft_chain *chain,
+				    const struct nft_rule *rule)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	const struct nft_expr *expr, *next;
+	struct nlattr *list;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+			flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= 0;
+
+	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
+		goto nla_put_failure;
+	if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+		goto nla_put_failure;
+
+	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+	if (list == NULL)
+		goto nla_put_failure;
+	nft_rule_for_each_expr(expr, next, rule) {
+		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
+		if (elem == NULL)
+			goto nla_put_failure;
+		if (nf_tables_fill_expr_info(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, elem);
+	}
+	nla_nest_end(skb, list);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_rule_notify(const struct sk_buff *oskb,
+				 const struct nlmsghdr *nlh,
+				 const struct nft_table *table,
+				 const struct nft_chain *chain,
+				 const struct nft_rule *rule,
+				 int event, u32 flags, int family)
+{
+	struct sk_buff *skb;
+	u32 portid = NETLINK_CB(oskb).portid;
+	struct net *net = oskb ? sock_net(oskb->sk) : &init_net;
+	u32 seq = nlh->nlmsg_seq;
+	bool report;
+	int err;
+
+	report = nlmsg_report(nlh);
+	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_rule_info(skb, portid, seq, event, flags,
+				       family, table, chain, rule);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+			     GFP_KERNEL);
+err:
+	if (err < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+	return err;
+}
+
+static int nf_tables_dump_rules(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	int family = nfmsg->nfgen_family;
+
+	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry(table, &afi->tables, list) {
+			list_for_each_entry(chain, &table->chains, list) {
+				list_for_each_entry(rule, &chain->rules, list) {
+					if (idx < s_idx)
+						goto cont;
+					if (idx > s_idx)
+						memset(&cb->args[1], 0,
+						       sizeof(cb->args) - sizeof(cb->args[0]));
+					if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid,
+								      cb->nlh->nlmsg_seq,
+								      NFT_MSG_NEWRULE,
+								      NLM_F_MULTI | NLM_F_APPEND,
+								      afi->family, table, chain, rule) < 0)
+						goto done;
+cont:
+					idx++;
+				}
+			}
+		}
+	}
+done:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	struct sk_buff *skb2;
+	int family = nfmsg->nfgen_family;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_rules,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+	if (IS_ERR(rule))
+		return PTR_ERR(rule);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid,
+				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+				       family, table, chain, rule);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+static void nf_tables_rcu_rule_destroy(struct rcu_head *head)
+{
+	struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head);
+	struct nft_expr *expr;
+
+	/*
+	 * Careful: some expressions might not be initialized in case this
+	 * is called on error from nf_tables_newrule().
+	 */
+	expr = nft_expr_first(rule);
+	while (expr->ops && expr != nft_expr_last(rule)) {
+		nf_tables_expr_destroy(expr);
+		expr = nft_expr_next(expr);
+	}
+	kfree(rule);
+}
+
+static void nf_tables_rule_destroy(struct nft_rule *rule)
+{
+	call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy);
+}
+
+#define NFT_RULE_MAXEXPRS	128
+
+static struct nft_expr_info *info;
+
+static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_expr *expr;
+	struct nft_ctx ctx;
+	struct nlattr *tmp;
+	unsigned int size, i, n;
+	int err, rem;
+	bool create;
+	u64 handle;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], create);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (nla[NFTA_RULE_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
+		rule = __nf_tables_rule_lookup(chain, handle);
+		if (IS_ERR(rule))
+			return PTR_ERR(rule);
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			old_rule = rule;
+		else
+			return -EOPNOTSUPP;
+	} else {
+		if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EINVAL;
+		handle = nf_tables_alloc_handle(table);
+	}
+
+	n = 0;
+	size = 0;
+	if (nla[NFTA_RULE_EXPRESSIONS]) {
+		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
+			err = -EINVAL;
+			if (nla_type(tmp) != NFTA_LIST_ELEM)
+				goto err1;
+			if (n == NFT_RULE_MAXEXPRS)
+				goto err1;
+			err = nf_tables_expr_parse(tmp, &info[n]);
+			if (err < 0)
+				goto err1;
+			size += info[n].ops->size;
+			n++;
+		}
+	}
+
+	err = -ENOMEM;
+	rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL);
+	if (rule == NULL)
+		goto err1;
+
+	rule->handle = handle;
+	rule->dlen   = size;
+
+	nft_ctx_init(&ctx, afi, table, chain);
+	expr = nft_expr_first(rule);
+	for (i = 0; i < n; i++) {
+		err = nf_tables_newexpr(&ctx, &info[i], expr);
+		if (err < 0)
+			goto err2;
+		expr = nft_expr_next(expr);
+	}
+
+	/* Register hook when first rule is inserted into a base chain */
+	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0)
+			goto err2;
+	}
+
+	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		list_replace_rcu(&old_rule->list, &rule->list);
+		nf_tables_rule_destroy(old_rule);
+	} else if (nlh->nlmsg_flags & NLM_F_APPEND)
+		list_add_tail_rcu(&rule->list, &chain->rules);
+	else
+		list_add_rcu(&rule->list, &chain->rules);
+
+	nf_tables_rule_notify(skb, nlh, table, chain, rule, NFT_MSG_NEWRULE,
+			      nlh->nlmsg_flags & (NLM_F_APPEND | NLM_F_REPLACE),
+			      nfmsg->nfgen_family);
+	return 0;
+
+err2:
+	nf_tables_rule_destroy(rule);
+err1:
+	for (i = 0; i < n; i++) {
+		if (info[i].ops != NULL)
+			module_put(info[i].ops->owner);
+	}
+	return err;
+}
+
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_rule *rule, *tmp;
+	int family = nfmsg->nfgen_family;
+
+	afi = nf_tables_afinfo_lookup(family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (nla[NFTA_RULE_HANDLE]) {
+		rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+		if (IS_ERR(rule))
+			return PTR_ERR(rule);
+
+		/* List removal must be visible before destroying expressions */
+		list_del_rcu(&rule->list);
+
+		nf_tables_rule_notify(skb, nlh, table, chain, rule,
+				      NFT_MSG_DELRULE, 0, family);
+		nf_tables_rule_destroy(rule);
+	} else {
+		/* Remove all rules in this chain */
+		list_for_each_entry_safe(rule, tmp, &chain->rules, list) {
+			list_del_rcu(&rule->list);
+
+			nf_tables_rule_notify(skb, nlh, table, chain, rule,
+					      NFT_MSG_DELRULE, 0, family);
+			nf_tables_rule_destroy(rule);
+		}
+	}
+
+	/* Unregister hook when last rule from base chain is deleted */
+	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN)
+		nf_unregister_hook(&nft_base_chain(chain)->ops);
+
+	return 0;
+}
+
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+	[NFT_MSG_NEWTABLE] = {
+		.call		= nf_tables_newtable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_GETTABLE] = {
+		.call		= nf_tables_gettable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_DELTABLE] = {
+		.call		= nf_tables_deltable,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
+	[NFT_MSG_NEWCHAIN] = {
+		.call		= nf_tables_newchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_GETCHAIN] = {
+		.call		= nf_tables_getchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_DELCHAIN] = {
+		.call		= nf_tables_delchain,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
+	[NFT_MSG_NEWRULE] = {
+		.call		= nf_tables_newrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_GETRULE] = {
+		.call		= nf_tables_getrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+	[NFT_MSG_DELRULE] = {
+		.call		= nf_tables_delrule,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
+};
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+	.name		= "nf_tables",
+	.subsys_id	= NFNL_SUBSYS_NFTABLES,
+	.cb_count	= NFT_MSG_MAX,
+	.cb		= nf_tables_cb,
+};
+
+/**
+ *	nft_validate_input_register - validate an expressions' input register
+ *
+ *	@reg: the register number
+ *
+ * 	Validate that the input register is one of the general purpose
+ * 	registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+	if (reg <= NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ *	nft_validate_output_register - validate an expressions' output register
+ *
+ *	@reg: the register number
+ *
+ * 	Validate that the output register is one of the general purpose
+ * 	registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+	if (reg < NFT_REG_VERDICT)
+		return -EINVAL;
+	if (reg > NFT_REG_MAX)
+		return -ERANGE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ *	nft_validate_data_load - validate an expressions' data load
+ *
+ *	@ctx: context of the expression performing the load
+ * 	@reg: the destination register number
+ * 	@data: the data to load
+ * 	@type: the data type
+ *
+ * 	Validate that a data load uses the appropriate data type for
+ * 	the destination register. A value of NULL for the data means
+ * 	that its runtime gathered data, which is always of type
+ * 	NFT_DATA_VALUE.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+			   const struct nft_data *data,
+			   enum nft_data_types type)
+{
+	switch (reg) {
+	case NFT_REG_VERDICT:
+		if (data == NULL || type != NFT_DATA_VERDICT)
+			return -EINVAL;
+		// FIXME: do loop detection
+		return 0;
+	default:
+		if (data != NULL && type != NFT_DATA_VALUE)
+			return -EINVAL;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+	[NFTA_VERDICT_CODE]	= { .type = NLA_U32 },
+	[NFTA_VERDICT_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+			    struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+	struct nft_chain *chain;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_VERDICT_CODE])
+		return -EINVAL;
+	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+	switch (data->verdict) {
+	case NF_ACCEPT:
+	case NF_DROP:
+	case NF_QUEUE:
+	case NFT_CONTINUE:
+	case NFT_BREAK:
+	case NFT_RETURN:
+		desc->len = sizeof(data->verdict);
+		break;
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (!tb[NFTA_VERDICT_CHAIN])
+			return -EINVAL;
+		chain = nf_tables_chain_lookup(ctx->table,
+					       tb[NFTA_VERDICT_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+		if (chain->flags & NFT_BASE_CHAIN)
+			return -EOPNOTSUPP;
+
+		if (ctx->chain->level + 1 > chain->level) {
+			if (ctx->chain->level + 1 == 16)
+				return -EMLINK;
+			chain->level = ctx->chain->level + 1;
+		}
+		chain->use++;
+		data->chain = chain;
+		desc->len = sizeof(data);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	desc->type = NFT_DATA_VERDICT;
+	return 0;
+}
+
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+	switch (data->verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		data->chain->use--;
+		break;
+	}
+}
+
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict)))
+		goto nla_put_failure;
+
+	switch (data->verdict) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+			  struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	unsigned int len;
+
+	len = nla_len(nla);
+	if (len == 0)
+		return -EINVAL;
+	if (len > sizeof(data->data))
+		return -EOVERFLOW;
+
+	nla_memcpy(data->data, nla, sizeof(data->data));
+	desc->type = NFT_DATA_VALUE;
+	desc->len  = len;
+	return 0;
+}
+
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+			  unsigned int len)
+{
+	return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+	[NFTA_DATA_VALUE]	= { .type = NLA_BINARY,
+				    .len  = FIELD_SIZEOF(struct nft_data, data) },
+	[NFTA_DATA_VERDICT]	= { .type = NLA_NESTED },
+};
+
+/**
+ *	nft_data_init - parse nf_tables data netlink attributes
+ *
+ *	@ctx: context of the expression using the data
+ *	@data: destination struct nft_data
+ *	@desc: data description
+ *	@nla: netlink attribute containing data
+ *
+ *	Parse the netlink data attributes and initialize a struct nft_data.
+ *	The type and length of data are returned in the data description.
+ *
+ *	The caller can indicate that it only wants to accept data of type
+ *	NFT_DATA_VALUE by passing NULL for the ctx argument.
+ */
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+		  struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_DATA_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_DATA_VALUE])
+		return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+	if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
+		return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+/**
+ *	nft_data_uninit - release a nft_data item
+ *
+ *	@data: struct nft_data to release
+ *	@type: type of data
+ *
+ *	Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ *	all others need to be released by calling this function.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+{
+	switch (type) {
+	case NFT_DATA_VALUE:
+		return;
+	case NFT_DATA_VERDICT:
+		return nft_verdict_uninit(data);
+	default:
+		WARN_ON(1);
+	}
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+		  enum nft_data_types type, unsigned int len)
+{
+	struct nlattr *nest;
+	int err;
+
+	nest = nla_nest_start(skb, attr);
+	if (nest == NULL)
+		return -1;
+
+	switch (type) {
+	case NFT_DATA_VALUE:
+		err = nft_value_dump(skb, data, len);
+		break;
+	case NFT_DATA_VERDICT:
+		err = nft_verdict_dump(skb, data);
+		break;
+	default:
+		err = -EINVAL;
+		WARN_ON(1);
+	}
+
+	nla_nest_end(skb, nest);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+static int __init nf_tables_module_init(void)
+{
+	int err;
+
+	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
+		       GFP_KERNEL);
+	if (info == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = nf_tables_core_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nfnetlink_subsys_register(&nf_tables_subsys);
+	if (err < 0)
+		goto err3;
+
+	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
+	return 0;
+err3:
+	nf_tables_core_module_exit();
+err2:
+	kfree(info);
+err1:
+	return err;
+}
+
+static void __exit nf_tables_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nf_tables_subsys);
+	nf_tables_core_module_exit();
+	kfree(info);
+}
+
+module_init(nf_tables_module_init);
+module_exit(nf_tables_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 000000000000..bc7fb85d4002
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+#define NFT_JUMP_STACK_SIZE	16
+
+unsigned int nft_do_chain(const struct nf_hook_ops *ops,
+			  struct sk_buff *skb,
+			  const struct net_device *in,
+			  const struct net_device *out,
+			  int (*okfn)(struct sk_buff *))
+{
+	const struct nft_chain *chain = ops->priv;
+	const struct nft_rule *rule;
+	const struct nft_expr *expr, *last;
+	struct nft_data data[NFT_REG_MAX + 1];
+	const struct nft_pktinfo pkt = {
+		.skb		= skb,
+		.in		= in,
+		.out		= out,
+		.hooknum	= ops->hooknum,
+	};
+	unsigned int stackptr = 0;
+	struct {
+		const struct nft_chain	*chain;
+		const struct nft_rule	*rule;
+	} jumpstack[NFT_JUMP_STACK_SIZE];
+
+do_chain:
+	rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+	data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+		nft_rule_for_each_expr(expr, last, rule) {
+			expr->ops->eval(expr, data, &pkt);
+			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+				break;
+		}
+
+		switch (data[NFT_REG_VERDICT].verdict) {
+		case NFT_BREAK:
+			data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+			/* fall through */
+		case NFT_CONTINUE:
+			continue;
+		}
+		break;
+	}
+
+	switch (data[NFT_REG_VERDICT].verdict) {
+	case NF_ACCEPT:
+	case NF_DROP:
+	case NF_QUEUE:
+		return data[NFT_REG_VERDICT].verdict;
+	case NFT_JUMP:
+		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+		jumpstack[stackptr].chain = chain;
+		jumpstack[stackptr].rule  = rule;
+		stackptr++;
+		/* fall through */
+	case NFT_GOTO:
+		chain = data[NFT_REG_VERDICT].chain;
+		goto do_chain;
+	case NFT_RETURN:
+	case NFT_CONTINUE:
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	if (stackptr > 0) {
+		stackptr--;
+		chain = jumpstack[stackptr].chain;
+		rule  = jumpstack[stackptr].rule;
+		goto next_rule;
+	}
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+	int err;
+
+	err = nft_immediate_module_init();
+	if (err < 0)
+		goto err1;
+
+	err = nft_cmp_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nft_lookup_module_init();
+	if (err < 0)
+		goto err3;
+
+	err = nft_bitwise_module_init();
+	if (err < 0)
+		goto err4;
+
+	err = nft_byteorder_module_init();
+	if (err < 0)
+		goto err5;
+
+	err = nft_payload_module_init();
+	if (err < 0)
+		goto err6;
+
+	return 0;
+
+err6:
+	nft_byteorder_module_exit();
+err5:
+	nft_bitwise_module_exit();
+err4:
+	nft_lookup_module_exit();
+err3:
+	nft_cmp_module_exit();
+err2:
+	nft_immediate_module_exit();
+err1:
+	return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+	nft_payload_module_exit();
+	nft_byteorder_module_exit();
+	nft_bitwise_module_exit();
+	nft_lookup_module_exit();
+	nft_cmp_module_exit();
+	nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 000000000000..0f7501506367
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitwise {
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			len;
+	struct nft_data		mask;
+	struct nft_data		xor;
+};
+
+static void nft_bitwise_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	const struct nft_data *src = &data[priv->sreg];
+	struct nft_data *dst = &data[priv->dreg];
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) {
+		dst->data[i] = (src->data[i] & priv->mask.data[i]) ^
+			       priv->xor.data[i];
+	}
+}
+
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+	[NFTA_BITWISE_SREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_DREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED },
+	[NFTA_BITWISE_XOR]	= { .type = NLA_NESTED },
+};
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_bitwise *priv = nft_expr_priv(expr);
+	struct nft_data_desc d1, d2;
+	int err;
+
+	if (tb[NFTA_BITWISE_SREG] == NULL ||
+	    tb[NFTA_BITWISE_DREG] == NULL ||
+	    tb[NFTA_BITWISE_LEN] == NULL ||
+	    tb[NFTA_BITWISE_MASK] == NULL ||
+	    tb[NFTA_BITWISE_XOR] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+
+	err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+	if (err < 0)
+		return err;
+	if (d1.len != priv->len)
+		return -EINVAL;
+
+	err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+	if (err < 0)
+		return err;
+	if (d2.len != priv->len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_bitwise_ops __read_mostly = {
+	.name		= "bitwise",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_bitwise_eval,
+	.init		= nft_bitwise_init,
+	.dump		= nft_bitwise_dump,
+	.policy		= nft_bitwise_policy,
+	.maxattr	= NFTA_BITWISE_MAX,
+};
+
+int __init nft_bitwise_module_init(void)
+{
+	return nft_register_expr(&nft_bitwise_ops);
+}
+
+void nft_bitwise_module_exit(void)
+{
+	nft_unregister_expr(&nft_bitwise_ops);
+}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
new file mode 100644
index 000000000000..8b0657a4d17b
--- /dev/null
+++ b/net/netfilter/nft_byteorder.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_byteorder {
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	enum nft_byteorder_ops	op:8;
+	u8			len;
+	u8			size;
+};
+
+static void nft_byteorder_eval(const struct nft_expr *expr,
+			       struct nft_data data[NFT_REG_MAX + 1],
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+	struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg];
+	union { u32 u32; u16 u16; } *s, *d;
+	unsigned int i;
+
+	s = (void *)src->data;
+	d = (void *)dst->data;
+
+	switch (priv->size) {
+	case 4:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 4; i++)
+				d[i].u32 = ntohl((__force __be32)s[i].u32);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 4; i++)
+				d[i].u32 = (__force __u32)htonl(s[i].u32);
+			break;
+		}
+		break;
+	case 2:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 2; i++)
+				d[i].u16 = ntohs((__force __be16)s[i].u16);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 2; i++)
+				d[i].u16 = (__force __u16)htons(s[i].u16);
+			break;
+		}
+		break;
+	}
+}
+
+static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
+	[NFTA_BYTEORDER_SREG]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_DREG]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_OP]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_LEN]	= { .type = NLA_U32 },
+	[NFTA_BYTEORDER_SIZE]	= { .type = NLA_U32 },
+};
+
+static int nft_byteorder_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_byteorder *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_BYTEORDER_SREG] == NULL ||
+	    tb[NFTA_BYTEORDER_DREG] == NULL ||
+	    tb[NFTA_BYTEORDER_LEN] == NULL ||
+	    tb[NFTA_BYTEORDER_SIZE] == NULL ||
+	    tb[NFTA_BYTEORDER_OP] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP]));
+	switch (priv->op) {
+	case NFT_BYTEORDER_NTOH:
+	case NFT_BYTEORDER_HTON:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+	if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+	switch (priv->size) {
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_byteorder_ops __read_mostly = {
+	.name		= "byteorder",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_byteorder_eval,
+	.init		= nft_byteorder_init,
+	.dump		= nft_byteorder_dump,
+	.policy		= nft_byteorder_policy,
+	.maxattr	= NFTA_BYTEORDER_MAX,
+};
+
+int __init nft_byteorder_module_init(void)
+{
+	return nft_register_expr(&nft_byteorder_ops);
+}
+
+void nft_byteorder_module_exit(void)
+{
+	nft_unregister_expr(&nft_byteorder_ops);
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
new file mode 100644
index 000000000000..e734d670120a
--- /dev/null
+++ b/net/netfilter/nft_cmp.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_cmp_expr {
+	struct nft_data		data;
+	enum nft_registers	sreg:8;
+	u8			len;
+	enum nft_cmp_ops	op:8;
+};
+
+static void nft_cmp_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	int d;
+
+	d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len);
+	switch (priv->op) {
+	case NFT_CMP_EQ:
+		if (d != 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_NEQ:
+		if (d == 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_LT:
+		if (d == 0)
+			goto mismatch;
+	case NFT_CMP_LTE:
+		if (d > 0)
+			goto mismatch;
+		break;
+	case NFT_CMP_GT:
+		if (d == 0)
+			goto mismatch;
+	case NFT_CMP_GTE:
+		if (d < 0)
+			goto mismatch;
+		break;
+	}
+	return;
+
+mismatch:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = {
+	[NFTA_CMP_SREG]		= { .type = NLA_U32 },
+	[NFTA_CMP_OP]		= { .type = NLA_U32 },
+	[NFTA_CMP_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	int err;
+
+	if (tb[NFTA_CMP_SREG] == NULL ||
+	    tb[NFTA_CMP_OP] == NULL ||
+	    tb[NFTA_CMP_DATA] == NULL)
+		return -EINVAL;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+	switch (priv->op) {
+	case NFT_CMP_EQ:
+	case NFT_CMP_NEQ:
+	case NFT_CMP_LT:
+	case NFT_CMP_LTE:
+	case NFT_CMP_GT:
+	case NFT_CMP_GTE:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+	if (err < 0)
+		return err;
+
+	priv->len = desc.len;
+	return 0;
+}
+
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_cmp_ops __read_mostly = {
+	.name		= "cmp",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_cmp_eval,
+	.init		= nft_cmp_init,
+	.dump		= nft_cmp_dump,
+	.policy		= nft_cmp_policy,
+	.maxattr	= NFTA_CMP_MAX,
+};
+
+int __init nft_cmp_module_init(void)
+{
+	return nft_register_expr(&nft_cmp_ops);
+}
+
+void nft_cmp_module_exit(void)
+{
+	nft_unregister_expr(&nft_cmp_ops);
+}
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 000000000000..33c5d36819bb
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+	seqlock_t	lock;
+	u64		bytes;
+	u64		packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	write_seqlock_bh(&priv->lock);
+	priv->bytes += pkt->skb->len;
+	priv->packets++;
+	write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+	unsigned int seq;
+	u64 bytes;
+	u64 packets;
+
+	do {
+		seq = read_seqbegin(&priv->lock);
+		bytes	= priv->bytes;
+		packets	= priv->packets;
+	} while (read_seqretry(&priv->lock, seq));
+
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_counter *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_COUNTER_PACKETS])
+	        priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	if (tb[NFTA_COUNTER_BYTES])
+		priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+	seqlock_init(&priv->lock);
+	return 0;
+}
+
+static struct nft_expr_ops nft_counter_ops __read_mostly = {
+	.name		= "counter",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.owner		= THIS_MODULE,
+	.eval		= nft_counter_eval,
+	.init		= nft_counter_init,
+	.dump		= nft_counter_dump,
+};
+
+static int __init nft_counter_module_init(void)
+{
+	return nft_register_expr(&nft_counter_ops);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+	nft_unregister_expr(&nft_counter_ops);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 000000000000..a1756d678226
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+
+struct nft_ct {
+	enum nft_ct_keys	key:8;
+	enum ip_conntrack_dir	dir:8;
+	enum nft_registers	dreg:8;
+	uint8_t			family;
+};
+
+static void nft_ct_eval(const struct nft_expr *expr,
+			struct nft_data data[NFT_REG_MAX + 1],
+			const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+	struct nft_data *dest = &data[priv->dreg];
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_tuple *tuple;
+	const struct nf_conntrack_helper *helper;
+	long diff;
+	unsigned int state;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+
+	switch (priv->key) {
+	case NFT_CT_STATE:
+		if (ct == NULL)
+			state = NF_CT_STATE_INVALID_BIT;
+		else if (nf_ct_is_untracked(ct))
+			state = NF_CT_STATE_UNTRACKED_BIT;
+		else
+			state = NF_CT_STATE_BIT(ctinfo);
+		dest->data[0] = state;
+		return;
+	}
+
+	if (ct == NULL)
+		goto err;
+
+	switch (priv->key) {
+	case NFT_CT_DIRECTION:
+		dest->data[0] = CTINFO2DIR(ctinfo);
+		return;
+	case NFT_CT_STATUS:
+		dest->data[0] = ct->status;
+		return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+		dest->data[0] = ct->mark;
+		return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+		dest->data[0] = ct->secmark;
+		return;
+#endif
+	case NFT_CT_EXPIRATION:
+		diff = (long)jiffies - (long)ct->timeout.expires;
+		if (diff < 0)
+			diff = 0;
+		dest->data[0] = jiffies_to_msecs(diff);
+		return;
+	case NFT_CT_HELPER:
+		if (ct->master == NULL)
+			goto err;
+		help = nfct_help(ct->master);
+		if (help == NULL)
+			goto err;
+		helper = rcu_dereference(help->helper);
+		if (helper == NULL)
+			goto err;
+		if (strlen(helper->name) >= sizeof(dest->data))
+			goto err;
+		strncpy((char *)dest->data, helper->name, sizeof(dest->data));
+		return;
+	}
+
+	tuple = &ct->tuplehash[priv->dir].tuple;
+	switch (priv->key) {
+	case NFT_CT_L3PROTOCOL:
+		dest->data[0] = nf_ct_l3num(ct);
+		return;
+	case NFT_CT_SRC:
+		memcpy(dest->data, tuple->src.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_DST:
+		memcpy(dest->data, tuple->dst.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_PROTOCOL:
+		dest->data[0] = nf_ct_protonum(ct);
+		return;
+	case NFT_CT_PROTO_SRC:
+		dest->data[0] = (__force __u16)tuple->src.u.all;
+		return;
+	case NFT_CT_PROTO_DST:
+		dest->data[0] = (__force __u16)tuple->dst.u.all;
+		return;
+	}
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
+	[NFTA_CT_DREG]		= { .type = NLA_U32 },
+	[NFTA_CT_KEY]		= { .type = NLA_U32 },
+	[NFTA_CT_DIRECTION]	= { .type = NLA_U8 },
+};
+
+static int nft_ct_init(const struct nft_ctx *ctx,
+		       const struct nft_expr *expr,
+		       const struct nlattr * const tb[])
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_CT_DREG] == NULL ||
+	    tb[NFTA_CT_KEY] == NULL)
+		return -EINVAL;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	if (tb[NFTA_CT_DIRECTION] != NULL) {
+		priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+		switch (priv->dir) {
+		case IP_CT_DIR_ORIGINAL:
+		case IP_CT_DIR_REPLY:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (priv->key) {
+	case NFT_CT_STATE:
+	case NFT_CT_DIRECTION:
+	case NFT_CT_STATUS:
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+#endif
+	case NFT_CT_EXPIRATION:
+	case NFT_CT_HELPER:
+		if (tb[NFTA_CT_DIRECTION] != NULL)
+			return -EINVAL;
+		break;
+	case NFT_CT_PROTOCOL:
+	case NFT_CT_SRC:
+	case NFT_CT_DST:
+	case NFT_CT_PROTO_SRC:
+	case NFT_CT_PROTO_DST:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nf_ct_l3proto_try_module_get(ctx->afi->family);
+	if (err < 0)
+		return err;
+	priv->family = ctx->afi->family;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		goto err1;
+
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		goto err1;
+	return 0;
+
+err1:
+	nf_ct_l3proto_module_put(ctx->afi->family);
+	return err;
+}
+
+static void nft_ct_destroy(const struct nft_expr *expr)
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+
+	nf_ct_l3proto_module_put(priv->family);
+}
+
+static int nft_ct_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_ct_ops __read_mostly = {
+	.name		= "ct",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_ct_eval,
+	.init		= nft_ct_init,
+	.destroy	= nft_ct_destroy,
+	.dump		= nft_ct_dump,
+	.policy		= nft_ct_policy,
+	.maxattr	= NFTA_CT_MAX,
+};
+
+static int __init nft_ct_module_init(void)
+{
+	return nft_register_expr(&nft_ct_ops);
+}
+
+static void __exit nft_ct_module_exit(void)
+{
+	nft_unregister_expr(&nft_ct_ops);
+}
+
+module_init(nft_ct_module_init);
+module_exit(nft_ct_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("ct");
diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c
new file mode 100644
index 000000000000..9fc8eb308193
--- /dev/null
+++ b/net/netfilter/nft_expr_template.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template {
+
+};
+
+static void nft_template_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+	[NFTA_TEMPLATE_ATTR]		= { .type = NLA_U32 },
+};
+
+static int nft_template_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr *tb[])
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+	return 0;
+}
+
+static void nft_template_destroy(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr)
+{
+	struct nft_template *priv = nft_expr_priv(expr);
+
+}
+
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_template *priv = nft_expr_priv(expr);
+
+	NLA_PUT_BE32(skb, NFTA_TEMPLATE_ATTR, priv->field);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops template_ops __read_mostly = {
+	.name		= "template",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_template_eval,
+	.init		= nft_template_init,
+	.destroy	= nft_template_destroy,
+	.dump		= nft_template_dump,
+	.policy		= nft_template_policy,
+	.maxattr	= NFTA_TEMPLATE_MAX,
+};
+
+static int __init nft_template_module_init(void)
+{
+	return nft_register_expr(&template_ops);
+}
+
+static void __exit nft_template_module_exit(void)
+{
+	nft_unregister_expr(&template_ops);
+}
+
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 000000000000..21c6a6b7b662
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+// FIXME:
+#include <net/ipv6.h>
+
+struct nft_exthdr {
+	u8			type;
+	u8			offset;
+	u8			len;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_exthdr_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	struct nft_data *dest = &data[priv->dreg];
+	unsigned int offset;
+	int err;
+
+	err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
+	if (err < 0)
+		goto err;
+	offset += priv->offset;
+
+	if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0)
+		goto err;
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
+	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
+	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 },
+};
+
+static int nft_exthdr_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_EXTHDR_DREG] == NULL ||
+	    tb[NFTA_EXTHDR_TYPE] == NULL ||
+	    tb[NFTA_EXTHDR_OFFSET] == NULL ||
+	    tb[NFTA_EXTHDR_LEN] == NULL)
+		return -EINVAL;
+
+	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+	priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+	priv->len    = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+	if (priv->len == 0 ||
+	    priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops exthdr_ops __read_mostly = {
+	.name		= "exthdr",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_exthdr_eval,
+	.init		= nft_exthdr_init,
+	.dump		= nft_exthdr_dump,
+	.policy		= nft_exthdr_policy,
+	.maxattr	= NFTA_EXTHDR_MAX,
+};
+
+static int __init nft_exthdr_module_init(void)
+{
+	return nft_register_expr(&exthdr_ops);
+}
+
+static void __exit nft_exthdr_module_exit(void)
+{
+	nft_unregister_expr(&exthdr_ops);
+}
+
+module_init(nft_exthdr_module_init);
+module_exit(nft_exthdr_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
new file mode 100644
index 000000000000..67cc502881f1
--- /dev/null
+++ b/net/netfilter/nft_hash.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_hash {
+	struct hlist_head	*hash;
+	unsigned int		hsize;
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			klen;
+	u8			dlen;
+	u16			flags;
+};
+
+struct nft_hash_elem {
+	struct hlist_node	hnode;
+	struct nft_data		key;
+	struct nft_data		data[];
+};
+
+static u32 nft_hash_rnd __read_mostly;
+static bool nft_hash_rnd_initted __read_mostly;
+
+static unsigned int nft_hash_data(const struct nft_data *data,
+				  unsigned int hsize, unsigned int len)
+{
+	unsigned int h;
+
+	// FIXME: can we reasonably guarantee the upper bits are fixed?
+	h = jhash2(data->data, len >> 2, nft_hash_rnd);
+	return ((u64)h * hsize) >> 32;
+}
+
+static void nft_hash_eval(const struct nft_expr *expr,
+			  struct nft_data data[NFT_REG_MAX + 1],
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct nft_hash_elem *elem;
+	const struct nft_data *key = &data[priv->sreg];
+	unsigned int h;
+
+	h = nft_hash_data(key, priv->hsize, priv->klen);
+	hlist_for_each_entry(elem, &priv->hash[h], hnode) {
+		if (nft_data_cmp(&elem->key, key, priv->klen))
+			continue;
+		if (priv->flags & NFT_HASH_MAP)
+			nft_data_copy(&data[priv->dreg], elem->data);
+		return;
+	}
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static void nft_hash_elem_destroy(const struct nft_expr *expr,
+				  struct nft_hash_elem *elem)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+
+	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
+	if (priv->flags & NFT_HASH_MAP)
+		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
+	kfree(elem);
+}
+
+static const struct nla_policy nft_he_policy[NFTA_HE_MAX + 1] = {
+	[NFTA_HE_KEY]		= { .type = NLA_NESTED },
+	[NFTA_HE_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_hash_elem_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr *nla,
+			      struct nft_hash_elem **new)
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	struct nlattr *tb[NFTA_HE_MAX + 1];
+	struct nft_hash_elem *elem;
+	struct nft_data_desc d1, d2;
+	unsigned int size;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_HE_MAX, nla, nft_he_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_HE_KEY] == NULL)
+		return -EINVAL;
+	size = sizeof(*elem);
+
+	if (priv->flags & NFT_HASH_MAP) {
+		if (tb[NFTA_HE_DATA] == NULL)
+			return -EINVAL;
+		size += sizeof(elem->data[0]);
+	} else {
+		if (tb[NFTA_HE_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	elem = kzalloc(size, GFP_KERNEL);
+	if (elem == NULL)
+		return -ENOMEM;
+
+	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_HE_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
+		goto err2;
+
+	if (tb[NFTA_HE_DATA] != NULL) {
+		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_HE_DATA]);
+		if (err < 0)
+			goto err2;
+		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
+		if (err < 0)
+			goto err3;
+	}
+
+	*new = elem;
+	return 0;
+
+err3:
+	nft_data_uninit(elem->data, d2.type);
+err2:
+	nft_data_uninit(&elem->key, d1.type);
+err1:
+	kfree(elem);
+	return err;
+}
+
+static int nft_hash_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			      const struct nft_hash_elem *elem)
+
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_HE_KEY, &elem->key,
+			  NFT_DATA_VALUE, priv->klen) < 0)
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_HASH_MAP) {
+		if (nft_data_dump(skb, NFTA_HE_DATA, elem->data,
+				  NFT_DATA_VALUE, priv->dlen) < 0)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct hlist_node *next;
+	struct nft_hash_elem *elem;
+	unsigned int i;
+
+	for (i = 0; i < priv->hsize; i++) {
+		hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
+			hlist_del(&elem->hnode);
+			nft_hash_elem_destroy(expr, elem);
+		}
+	}
+	kfree(priv->hash);
+}
+
+static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
+	[NFTA_HASH_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_KLEN]	= { .type = NLA_U32 },
+	[NFTA_HASH_ELEMENTS]	= { .type = NLA_NESTED },
+};
+
+static int nft_hash_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	struct nft_hash_elem *elem, *uninitialized_var(new);
+	const struct nlattr *nla;
+	unsigned int cnt, i;
+	unsigned int h;
+	int err, rem;
+
+	if (unlikely(!nft_hash_rnd_initted)) {
+		get_random_bytes(&nft_hash_rnd, 4);
+		nft_hash_rnd_initted = true;
+	}
+
+	if (tb[NFTA_HASH_SREG] == NULL ||
+	    tb[NFTA_HASH_KLEN] == NULL ||
+	    tb[NFTA_HASH_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	if (tb[NFTA_HASH_FLAGS] != NULL) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_HASH_FLAGS]));
+		if (priv->flags & ~NFT_HASH_MAP)
+			return -EINVAL;
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_HASH_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_HASH_DREG] != NULL) {
+		if (!(priv->flags & NFT_HASH_MAP))
+			return -EINVAL;
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_HASH_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+	}
+
+	priv->klen = ntohl(nla_get_be32(tb[NFTA_HASH_KLEN]));
+	if (priv->klen == 0)
+		return -EINVAL;
+
+	cnt = 0;
+	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
+		if (nla_type(nla) != NFTA_LIST_ELEM)
+			return -EINVAL;
+		cnt++;
+	}
+
+	/* Aim for a load factor of 0.75 */
+	cnt = cnt * 4 / 3;
+
+	priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
+	if (priv->hash == NULL)
+		return -ENOMEM;
+	priv->hsize = cnt;
+
+	for (i = 0; i < cnt; i++)
+		INIT_HLIST_HEAD(&priv->hash[i]);
+
+	err = -ENOMEM;
+	nla_for_each_nested(nla, tb[NFTA_HASH_ELEMENTS], rem) {
+		err = nft_hash_elem_init(ctx, expr, nla, &new);
+		if (err < 0)
+			goto err1;
+
+		h = nft_hash_data(&new->key, priv->hsize, priv->klen);
+		hlist_for_each_entry(elem, &priv->hash[h], hnode) {
+			if (nft_data_cmp(&elem->key, &new->key, priv->klen))
+				continue;
+			nft_hash_elem_destroy(expr, new);
+			err = -EEXIST;
+			goto err1;
+		}
+		hlist_add_head(&new->hnode, &priv->hash[h]);
+	}
+	return 0;
+
+err1:
+	nft_hash_destroy(ctx, expr);
+	return err;
+}
+
+static int nft_hash_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+	const struct nft_hash_elem *elem;
+	struct nlattr *list;
+	unsigned int i;
+
+	if (priv->flags)
+		if (nla_put_be32(skb, NFTA_HASH_FLAGS, htonl(priv->flags)))
+			goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HASH_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (priv->flags & NFT_HASH_MAP)
+		if (nla_put_be32(skb, NFTA_HASH_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HASH_KLEN, htonl(priv->klen)))
+		goto nla_put_failure;
+
+	list = nla_nest_start(skb, NFTA_HASH_ELEMENTS);
+	if (list == NULL)
+		goto nla_put_failure;
+
+	for (i = 0; i < priv->hsize; i++) {
+		hlist_for_each_entry(elem, &priv->hash[i], hnode) {
+			if (nft_hash_elem_dump(skb, expr, elem) < 0)
+				goto nla_put_failure;
+		}
+	}
+
+	nla_nest_end(skb, list);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_hash_ops __read_mostly = {
+	.name		= "hash",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_hash)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_hash_eval,
+	.init		= nft_hash_init,
+	.destroy	= nft_hash_destroy,
+	.dump		= nft_hash_dump,
+	.policy		= nft_hash_policy,
+	.maxattr	= NFTA_HASH_MAX,
+};
+
+static int __init nft_hash_module_init(void)
+{
+	return nft_register_expr(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+	nft_unregister_expr(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("hash");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
new file mode 100644
index 000000000000..3bf42c3cc49a
--- /dev/null
+++ b/net/netfilter/nft_immediate.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_immediate_expr {
+	struct nft_data		data;
+	enum nft_registers	dreg:8;
+	u8			dlen;
+};
+
+static void nft_immediate_eval(const struct nft_expr *expr,
+			       struct nft_data data[NFT_REG_MAX + 1],
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	nft_data_copy(&data[priv->dreg], &priv->data);
+}
+
+static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
+	[NFTA_IMMEDIATE_DREG]	= { .type = NLA_U32 },
+	[NFTA_IMMEDIATE_DATA]	= { .type = NLA_NESTED },
+};
+
+static int nft_immediate_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	int err;
+
+	if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
+	    tb[NFTA_IMMEDIATE_DATA] == NULL)
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+
+	err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
+	if (err < 0)
+		return err;
+	priv->dlen = desc.len;
+
+	err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+
+err1:
+	nft_data_uninit(&priv->data, desc.type);
+	return err;
+}
+
+static void nft_immediate_destroy(const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+
+	return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data,
+			     nft_dreg_to_type(priv->dreg), priv->dlen);
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_imm_ops __read_mostly = {
+	.name		= "immediate",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_immediate_eval,
+	.init		= nft_immediate_init,
+	.destroy	= nft_immediate_destroy,
+	.dump		= nft_immediate_dump,
+	.policy		= nft_immediate_policy,
+	.maxattr	= NFTA_IMMEDIATE_MAX,
+};
+
+int __init nft_immediate_module_init(void)
+{
+	return nft_register_expr(&nft_imm_ops);
+}
+
+void nft_immediate_module_exit(void)
+{
+	nft_unregister_expr(&nft_imm_ops);
+}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
new file mode 100644
index 000000000000..e0e3fc8aebc3
--- /dev/null
+++ b/net/netfilter/nft_limit.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(limit_lock);
+
+struct nft_limit {
+	u64		tokens;
+	u64		rate;
+	u64		unit;
+	unsigned long	stamp;
+};
+
+static void nft_limit_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	struct nft_limit *priv = nft_expr_priv(expr);
+
+	spin_lock_bh(&limit_lock);
+	if (time_after_eq(jiffies, priv->stamp)) {
+		priv->tokens = priv->rate;
+		priv->stamp = jiffies + priv->unit * HZ;
+	}
+
+	if (priv->tokens >= 1) {
+		priv->tokens--;
+		spin_unlock_bh(&limit_lock);
+		return;
+	}
+	spin_unlock_bh(&limit_lock);
+
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+	[NFTA_LIMIT_RATE]	= { .type = NLA_U64 },
+	[NFTA_LIMIT_UNIT]	= { .type = NLA_U64 },
+};
+
+static int nft_limit_init(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
+{
+	struct nft_limit *priv = nft_expr_priv(expr);
+
+	if (tb[NFTA_LIMIT_RATE] == NULL ||
+	    tb[NFTA_LIMIT_UNIT] == NULL)
+		return -EINVAL;
+
+	priv->rate   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+	priv->unit   = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+	priv->stamp  = jiffies + priv->unit * HZ;
+	priv->tokens = priv->rate;
+	return 0;
+}
+
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_limit *priv = nft_expr_priv(expr);
+
+	if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
+		goto nla_put_failure;
+	if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_limit_ops __read_mostly = {
+	.name		= "limit",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_limit_eval,
+	.init		= nft_limit_init,
+	.dump		= nft_limit_dump,
+	.policy		= nft_limit_policy,
+	.maxattr	= NFTA_LIMIT_MAX,
+};
+
+static int __init nft_limit_module_init(void)
+{
+	return nft_register_expr(&nft_limit_ops);
+}
+
+static void __exit nft_limit_module_exit(void)
+{
+	nft_unregister_expr(&nft_limit_ops);
+}
+
+module_init(nft_limit_module_init);
+module_exit(nft_limit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("limit");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
new file mode 100644
index 000000000000..da495c3b1e7e
--- /dev/null
+++ b/net/netfilter/nft_log.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netdevice.h>
+
+static const char *nft_log_null_prefix = "";
+
+struct nft_log {
+	struct nf_loginfo	loginfo;
+	char			*prefix;
+	int			family;
+};
+
+static void nft_log_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_log *priv = nft_expr_priv(expr);
+	struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+	nf_log_packet(net, priv->family, pkt->hooknum, pkt->skb, pkt->in,
+		      pkt->out, &priv->loginfo, "%s", priv->prefix);
+}
+
+static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
+	[NFTA_LOG_GROUP]	= { .type = NLA_U16 },
+	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING },
+	[NFTA_LOG_SNAPLEN]	= { .type = NLA_U32 },
+	[NFTA_LOG_QTHRESHOLD]	= { .type = NLA_U16 },
+};
+
+static int nft_log_init(const struct nft_ctx *ctx,
+			const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_log *priv = nft_expr_priv(expr);
+	struct nf_loginfo *li = &priv->loginfo;
+	const struct nlattr *nla;
+
+	priv->family = ctx->afi->family;
+
+	nla = tb[NFTA_LOG_PREFIX];
+	if (nla != NULL) {
+		priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+		if (priv->prefix == NULL)
+			return -ENOMEM;
+		nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+	} else
+		priv->prefix = (char *)nft_log_null_prefix;
+
+	li->type = NF_LOG_TYPE_ULOG;
+	if (tb[NFTA_LOG_GROUP] != NULL)
+		li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP]));
+
+	if (tb[NFTA_LOG_SNAPLEN] != NULL)
+		li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN]));
+	if (tb[NFTA_LOG_QTHRESHOLD] != NULL) {
+		li->u.ulog.qthreshold =
+			ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD]));
+	}
+
+	return 0;
+}
+
+static void nft_log_destroy(const struct nft_expr *expr)
+{
+	struct nft_log *priv = nft_expr_priv(expr);
+
+	if (priv->prefix != nft_log_null_prefix)
+		kfree(priv->prefix);
+}
+
+static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_log *priv = nft_expr_priv(expr);
+	const struct nf_loginfo *li = &priv->loginfo;
+
+	if (priv->prefix != nft_log_null_prefix)
+		if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix))
+			goto nla_put_failure;
+	if (li->u.ulog.group)
+		if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group)))
+			goto nla_put_failure;
+	if (li->u.ulog.copy_len)
+		if (nla_put_be32(skb, NFTA_LOG_SNAPLEN,
+				 htonl(li->u.ulog.copy_len)))
+			goto nla_put_failure;
+	if (li->u.ulog.qthreshold)
+		if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD,
+				 htons(li->u.ulog.qthreshold)))
+			goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_log_ops __read_mostly = {
+	.name		= "log",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_log)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_log_eval,
+	.init		= nft_log_init,
+	.destroy	= nft_log_destroy,
+	.dump		= nft_log_dump,
+	.policy		= nft_log_policy,
+	.maxattr	= NFTA_LOG_MAX,
+};
+
+static int __init nft_log_module_init(void)
+{
+	return nft_register_expr(&nft_log_ops);
+}
+
+static void __exit nft_log_module_exit(void)
+{
+	nft_unregister_expr(&nft_log_ops);
+}
+
+module_init(nft_log_module_init);
+module_exit(nft_log_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("log");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
new file mode 100644
index 000000000000..96735aa2f039
--- /dev/null
+++ b/net/netfilter/nft_meta.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
+#include <net/netfilter/nf_tables.h>
+
+struct nft_meta {
+	enum nft_meta_keys	key:8;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_meta_eval(const struct nft_expr *expr,
+			  struct nft_data data[NFT_REG_MAX + 1],
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+	const struct sk_buff *skb = pkt->skb;
+	const struct net_device *in = pkt->in, *out = pkt->out;
+	struct nft_data *dest = &data[priv->dreg];
+
+	switch (priv->key) {
+	case NFT_META_LEN:
+		dest->data[0] = skb->len;
+		break;
+	case NFT_META_PROTOCOL:
+		*(__be16 *)dest->data = skb->protocol;
+		break;
+	case NFT_META_PRIORITY:
+		dest->data[0] = skb->priority;
+		break;
+	case NFT_META_MARK:
+		dest->data[0] = skb->mark;
+		break;
+	case NFT_META_IIF:
+		if (in == NULL)
+			goto err;
+		dest->data[0] = in->ifindex;
+		break;
+	case NFT_META_OIF:
+		if (out == NULL)
+			goto err;
+		dest->data[0] = out->ifindex;
+		break;
+	case NFT_META_IIFNAME:
+		if (in == NULL)
+			goto err;
+		strncpy((char *)dest->data, in->name, sizeof(dest->data));
+		break;
+	case NFT_META_OIFNAME:
+		if (out == NULL)
+			goto err;
+		strncpy((char *)dest->data, out->name, sizeof(dest->data));
+		break;
+	case NFT_META_IIFTYPE:
+		if (in == NULL)
+			goto err;
+		*(u16 *)dest->data = in->type;
+		break;
+	case NFT_META_OIFTYPE:
+		if (out == NULL)
+			goto err;
+		*(u16 *)dest->data = out->type;
+		break;
+	case NFT_META_SKUID:
+		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+			goto err;
+
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket == NULL ||
+		    skb->sk->sk_socket->file == NULL) {
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			goto err;
+		}
+
+		dest->data[0] =
+			from_kuid_munged(&init_user_ns,
+				skb->sk->sk_socket->file->f_cred->fsuid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+		break;
+	case NFT_META_SKGID:
+		if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+			goto err;
+
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket == NULL ||
+		    skb->sk->sk_socket->file == NULL) {
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			goto err;
+		}
+		dest->data[0] =
+			from_kgid_munged(&init_user_ns,
+				 skb->sk->sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+		break;
+#ifdef CONFIG_NET_CLS_ROUTE
+	case NFT_META_RTCLASSID: {
+		const struct dst_entry *dst = skb_dst(skb);
+
+		if (dst == NULL)
+			goto err;
+		dest->data[0] = dst->tclassid;
+		break;
+	}
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+		dest->data[0] = skb->secmark;
+		break;
+#endif
+	default:
+		WARN_ON(1);
+		goto err;
+	}
+	return;
+
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+	[NFTA_META_DREG]	= { .type = NLA_U32 },
+	[NFTA_META_KEY]		= { .type = NLA_U32 },
+};
+
+static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_META_DREG] == NULL ||
+	    tb[NFTA_META_KEY] == NULL)
+		return -EINVAL;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (priv->key) {
+	case NFT_META_LEN:
+	case NFT_META_PROTOCOL:
+	case NFT_META_PRIORITY:
+	case NFT_META_MARK:
+	case NFT_META_IIF:
+	case NFT_META_OIF:
+	case NFT_META_IIFNAME:
+	case NFT_META_OIFNAME:
+	case NFT_META_IIFTYPE:
+	case NFT_META_OIFTYPE:
+	case NFT_META_SKUID:
+	case NFT_META_SKGID:
+#ifdef CONFIG_NET_CLS_ROUTE
+	case NFT_META_RTCLASSID:
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+#endif
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_meta_ops __read_mostly = {
+	.name		= "meta",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_meta_eval,
+	.init		= nft_meta_init,
+	.dump		= nft_meta_dump,
+	.policy		= nft_meta_policy,
+	.maxattr	= NFTA_META_MAX,
+};
+
+static int __init nft_meta_module_init(void)
+{
+	return nft_register_expr(&nft_meta_ops);
+}
+
+static void __exit nft_meta_module_exit(void)
+{
+	nft_unregister_expr(&nft_meta_ops);
+}
+
+module_init(nft_meta_module_init);
+module_exit(nft_meta_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_meta_target.c b/net/netfilter/nft_meta_target.c
new file mode 100644
index 000000000000..71177df75ffb
--- /dev/null
+++ b/net/netfilter/nft_meta_target.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_meta {
+	enum nft_meta_keys	key;
+};
+
+static void nft_meta_eval(const struct nft_expr *expr,
+			  struct nft_data *nfres,
+			  struct nft_data *data,
+			  const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *meta = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	u32 val = data->data[0];
+
+	switch (meta->key) {
+	case NFT_META_MARK:
+		skb->mark = val;
+		break;
+	case NFT_META_PRIORITY:
+		skb->priority = val;
+		break;
+	case NFT_META_NFTRACE:
+		skb->nf_trace = val;
+		break;
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+		skb->secmark = val;
+		break;
+#endif
+	default:
+		WARN_ON(1);
+	}
+}
+
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+	[NFTA_META_KEY]		= { .type = NLA_U32 },
+};
+
+static int nft_meta_init(const struct nft_expr *expr, struct nlattr *tb[])
+{
+	struct nft_meta *meta = nft_expr_priv(expr);
+
+	if (tb[NFTA_META_KEY] == NULL)
+		return -EINVAL;
+
+	meta->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (meta->key) {
+	case NFT_META_MARK:
+	case NFT_META_PRIORITY:
+	case NFT_META_NFTRACE:
+#ifdef CONFIG_NETWORK_SECMARK
+	case NFT_META_SECMARK:
+#endif
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_meta_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_meta *meta = nft_expr_priv(expr);
+
+	NLA_PUT_BE32(skb, NFTA_META_KEY, htonl(meta->key));
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops meta_target __read_mostly = {
+	.name		= "meta",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_meta_eval,
+	.init		= nft_meta_init,
+	.dump		= nft_meta_dump,
+	.policy		= nft_meta_policy,
+	.maxattr	= NFTA_META_MAX,
+};
+
+static int __init nft_meta_target_init(void)
+{
+	return nft_register_expr(&meta_target);
+}
+
+static void __exit nft_meta_target_exit(void)
+{
+	nft_unregister_expr(&meta_target);
+}
+
+module_init(nft_meta_target_init);
+module_exit(nft_meta_target_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
new file mode 100644
index 000000000000..329f134b3f89
--- /dev/null
+++ b/net/netfilter/nft_payload.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_payload {
+	enum nft_payload_bases	base:8;
+	u8			offset;
+	u8			len;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_payload_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+	const struct sk_buff *skb = pkt->skb;
+	struct nft_data *dest = &data[priv->dreg];
+	int offset;
+
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+		if (!skb_mac_header_was_set(skb))
+			goto err;
+		offset = skb_mac_header(skb) - skb->data;
+		break;
+	case NFT_PAYLOAD_NETWORK_HEADER:
+		offset = skb_network_offset(skb);
+		break;
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		offset = skb_transport_offset(skb);
+		break;
+	default:
+		BUG();
+	}
+	offset += priv->offset;
+
+	if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0)
+		goto err;
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
+	[NFTA_PAYLOAD_DREG]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_BASE]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_OFFSET]	= { .type = NLA_U32 },
+	[NFTA_PAYLOAD_LEN]	= { .type = NLA_U32 },
+};
+
+static int nft_payload_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_payload *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_PAYLOAD_DREG] == NULL ||
+	    tb[NFTA_PAYLOAD_BASE] == NULL ||
+	    tb[NFTA_PAYLOAD_OFFSET] == NULL ||
+	    tb[NFTA_PAYLOAD_LEN] == NULL)
+		return -EINVAL;
+
+	priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+	case NFT_PAYLOAD_NETWORK_HEADER:
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+	priv->len    = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+	if (priv->len == 0 ||
+	    priv->len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+	return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+}
+
+static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_payload *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) ||
+	    nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_payload_ops __read_mostly = {
+	.name		= "payload",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_payload_eval,
+	.init		= nft_payload_init,
+	.dump		= nft_payload_dump,
+	.policy		= nft_payload_policy,
+	.maxattr	= NFTA_PAYLOAD_MAX,
+};
+
+int __init nft_payload_module_init(void)
+{
+	return nft_register_expr(&nft_payload_ops);
+}
+
+void nft_payload_module_exit(void)
+{
+	nft_unregister_expr(&nft_payload_ops);
+}
diff --git a/net/netfilter/nft_set.c b/net/netfilter/nft_set.c
new file mode 100644
index 000000000000..7b7c8354c327
--- /dev/null
+++ b/net/netfilter/nft_set.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_set {
+	struct rb_root		root;
+	enum nft_registers	sreg:8;
+	enum nft_registers	dreg:8;
+	u8			klen;
+	u8			dlen;
+	u16			flags;
+};
+
+struct nft_set_elem {
+	struct rb_node		node;
+	enum nft_set_elem_flags	flags;
+	struct nft_data		key;
+	struct nft_data		data[];
+};
+
+static void nft_set_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+	const struct rb_node *parent = priv->root.rb_node;
+	const struct nft_set_elem *elem, *interval = NULL;
+	const struct nft_data *key = &data[priv->sreg];
+	int d;
+
+	while (parent != NULL) {
+		elem = rb_entry(parent, struct nft_set_elem, node);
+
+		d = nft_data_cmp(&elem->key, key, priv->klen);
+		if (d < 0) {
+			parent = parent->rb_left;
+			interval = elem;
+		} else if (d > 0)
+			parent = parent->rb_right;
+		else {
+found:
+			if (elem->flags & NFT_SE_INTERVAL_END)
+				goto out;
+			if (priv->flags & NFT_SET_MAP)
+				nft_data_copy(&data[priv->dreg], elem->data);
+			return;
+		}
+	}
+
+	if (priv->flags & NFT_SET_INTERVAL && interval != NULL) {
+		elem = interval;
+		goto found;
+	}
+out:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static void nft_set_elem_destroy(const struct nft_expr *expr,
+				 struct nft_set_elem *elem)
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+
+	nft_data_uninit(&elem->key, NFT_DATA_VALUE);
+	if (priv->flags & NFT_SET_MAP)
+		nft_data_uninit(elem->data, nft_dreg_to_type(priv->dreg));
+	kfree(elem);
+}
+
+static const struct nla_policy nft_se_policy[NFTA_SE_MAX + 1] = {
+	[NFTA_SE_KEY]		= { .type = NLA_NESTED },
+	[NFTA_SE_DATA]		= { .type = NLA_NESTED },
+	[NFTA_SE_FLAGS]		= { .type = NLA_U32 },
+};
+
+static int nft_set_elem_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr *nla,
+			     struct nft_set_elem **new)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nlattr *tb[NFTA_SE_MAX + 1];
+	struct nft_set_elem *elem;
+	struct nft_data_desc d1, d2;
+	enum nft_set_elem_flags flags = 0;
+	unsigned int size;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_SE_MAX, nla, nft_se_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_SE_KEY] == NULL)
+		return -EINVAL;
+
+	if (tb[NFTA_SE_FLAGS] != NULL) {
+		flags = ntohl(nla_get_be32(tb[NFTA_SE_FLAGS]));
+		if (flags & ~NFT_SE_INTERVAL_END)
+			return -EINVAL;
+	}
+
+	size = sizeof(*elem);
+	if (priv->flags & NFT_SET_MAP) {
+		if (tb[NFTA_SE_DATA] == NULL && !(flags & NFT_SE_INTERVAL_END))
+			return -EINVAL;
+		size += sizeof(elem->data[0]);
+	} else {
+		if (tb[NFTA_SE_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	elem = kzalloc(size, GFP_KERNEL);
+	if (elem == NULL)
+		return -ENOMEM;
+	elem->flags = flags;
+
+	err = nft_data_init(ctx, &elem->key, &d1, tb[NFTA_SE_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != priv->klen)
+		goto err2;
+
+	if (tb[NFTA_SE_DATA] != NULL) {
+		err = nft_data_init(ctx, elem->data, &d2, tb[NFTA_SE_DATA]);
+		if (err < 0)
+			goto err2;
+		err = -EINVAL;
+		if (priv->dreg != NFT_REG_VERDICT && d2.len != priv->dlen)
+			goto err2;
+		err = nft_validate_data_load(ctx, priv->dreg, elem->data, d2.type);
+		if (err < 0)
+			goto err3;
+	}
+
+	*new = elem;
+	return 0;
+
+err3:
+	nft_data_uninit(elem->data, d2.type);
+err2:
+	nft_data_uninit(&elem->key, d1.type);
+err1:
+	kfree(elem);
+	return err;
+}
+
+static int nft_set_elem_dump(struct sk_buff *skb, const struct nft_expr *expr,
+			     const struct nft_set_elem *elem)
+
+{
+	const struct nft_set *priv = nft_expr_priv(expr);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_SE_KEY, &elem->key,
+			  NFT_DATA_VALUE, priv->klen) < 0)
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_SET_MAP && !(elem->flags & NFT_SE_INTERVAL_END)) {
+		if (nft_data_dump(skb, NFTA_SE_DATA, elem->data,
+				  nft_dreg_to_type(priv->dreg), priv->dlen) < 0)
+			goto nla_put_failure;
+	}
+
+	if (elem->flags){
+		if (nla_put_be32(skb, NFTA_SE_FLAGS, htonl(elem->flags)))
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_set_destroy(const struct nft_expr *expr)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nft_set_elem *elem;
+	struct rb_node *node;
+
+	while ((node = priv->root.rb_node) != NULL) {
+		rb_erase(node, &priv->root);
+		elem = rb_entry(node, struct nft_set_elem, node);
+		nft_set_elem_destroy(expr, elem);
+	}
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+	[NFTA_SET_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_SET_SREG]		= { .type = NLA_U32 },
+	[NFTA_SET_DREG]		= { .type = NLA_U32 },
+	[NFTA_SET_KLEN]		= { .type = NLA_U32 },
+	[NFTA_SET_DLEN]		= { .type = NLA_U32 },
+	[NFTA_SET_ELEMENTS]	= { .type = NLA_NESTED },
+};
+
+static int nft_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	struct nft_set_elem *elem, *uninitialized_var(new);
+	struct rb_node *parent, **p;
+	const struct nlattr *nla;
+	int err, rem, d;
+
+	if (tb[NFTA_SET_SREG] == NULL ||
+	    tb[NFTA_SET_KLEN] == NULL ||
+	    tb[NFTA_SET_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	priv->root = RB_ROOT;
+
+	if (tb[NFTA_SET_FLAGS] != NULL) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_SET_FLAGS]));
+		if (priv->flags & ~(NFT_SET_INTERVAL | NFT_SET_MAP))
+			return -EINVAL;
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_SET_SREG]));
+	err = nft_validate_input_register(priv->sreg);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_SET_DREG] != NULL) {
+		if (!(priv->flags & NFT_SET_MAP))
+			return -EINVAL;
+		if (tb[NFTA_SET_DLEN] == NULL)
+			return -EINVAL;
+
+		priv->dreg = ntohl(nla_get_be32(tb[NFTA_SET_DREG]));
+		err = nft_validate_output_register(priv->dreg);
+		if (err < 0)
+			return err;
+
+		if (priv->dreg == NFT_REG_VERDICT)
+			priv->dlen = FIELD_SIZEOF(struct nft_data, data);
+		else {
+			priv->dlen = ntohl(nla_get_be32(tb[NFTA_SET_DLEN]));
+			if (priv->dlen == 0 ||
+			    priv->dlen > FIELD_SIZEOF(struct nft_data, data))
+				return -EINVAL;
+		}
+	} else {
+		if (priv->flags & NFT_SET_MAP)
+			return -EINVAL;
+		if (tb[NFTA_SET_DLEN] != NULL)
+			return -EINVAL;
+	}
+
+	priv->klen = ntohl(nla_get_be32(tb[NFTA_SET_KLEN]));
+	if (priv->klen == 0 ||
+	    priv->klen > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	nla_for_each_nested(nla, tb[NFTA_SET_ELEMENTS], rem) {
+		err = -EINVAL;
+		if (nla_type(nla) != NFTA_LIST_ELEM)
+			goto err1;
+
+		err = nft_set_elem_init(ctx, expr, nla, &new);
+		if (err < 0)
+			goto err1;
+
+		parent = NULL;
+		p = &priv->root.rb_node;
+		while (*p != NULL) {
+			parent = *p;
+			elem = rb_entry(parent, struct nft_set_elem, node);
+			d = nft_data_cmp(&elem->key, &new->key, priv->klen);
+			if (d < 0)
+				p = &parent->rb_left;
+			else if (d > 0)
+				p = &parent->rb_right;
+			else {
+				err = -EEXIST;
+				goto err2;
+			}
+		}
+		rb_link_node(&new->node, parent, p);
+		rb_insert_color(&new->node, &priv->root);
+	}
+
+	return 0;
+
+err2:
+	nft_set_elem_destroy(expr, new);
+err1:
+	nft_set_destroy(expr);
+	return err;
+}
+
+static int nft_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_set *priv = nft_expr_priv(expr);
+	const struct nft_set_elem *elem;
+	struct rb_node *node;
+	struct nlattr *list;
+
+	if (priv->flags) {
+		if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(priv->flags)))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_be32(skb, NFTA_SET_SREG, htonl(priv->sreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_SET_KLEN, htonl(priv->klen)))
+		goto nla_put_failure;
+
+	if (priv->flags & NFT_SET_MAP) {
+		if (nla_put_be32(skb, NFTA_SET_DREG, htonl(priv->dreg)))
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_SET_DLEN, htonl(priv->dlen)))
+			goto nla_put_failure;
+	}
+
+	list = nla_nest_start(skb, NFTA_SET_ELEMENTS);
+	if (list == NULL)
+		goto nla_put_failure;
+
+	for (node = rb_first(&priv->root); node; node = rb_next(node)) {
+		elem = rb_entry(node, struct nft_set_elem, node);
+		if (nft_set_elem_dump(skb, expr, elem) < 0)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, list);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_ops nft_set_ops __read_mostly = {
+	.name		= "set",
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_set)),
+	.owner		= THIS_MODULE,
+	.eval		= nft_set_eval,
+	.init		= nft_set_init,
+	.destroy	= nft_set_destroy,
+	.dump		= nft_set_dump,
+	.policy		= nft_set_policy,
+	.maxattr	= NFTA_SET_MAX,
+};
+
+static int __init nft_set_module_init(void)
+{
+	return nft_register_expr(&nft_set_ops);
+}
+
+static void __exit nft_set_module_exit(void)
+{
+	nft_unregister_expr(&nft_set_ops);
+}
+
+module_init(nft_set_module_init);
+module_exit(nft_set_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("set");
-- 
cgit v1.2.3


From ef1f7df9170dbd875ce198ba84e6ab80f6fc139e Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Oct 2013 11:41:20 +0200
Subject: netfilter: nf_tables: expression ops overloading

Split the expression ops into two parts and support overloading of
the runtime expression ops based on the requested function through
a ->select_ops() callback.

This can be used to provide optimized implementations, for instance
for loading small aligned amounts of data from the packet or inlining
frequently used operations into the main evaluation loop.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h      |  42 +++++++++-----
 net/ipv4/netfilter/nf_table_nat_ipv4.c |  18 ++++--
 net/ipv4/netfilter/nft_reject_ipv4.c   |  18 ++++--
 net/netfilter/nf_tables_api.c          | 101 +++++++++++++++++++--------------
 net/netfilter/nft_bitwise.c            |  18 ++++--
 net/netfilter/nft_byteorder.c          |  18 ++++--
 net/netfilter/nft_cmp.c                |  18 ++++--
 net/netfilter/nft_counter.c            |  22 ++++---
 net/netfilter/nft_ct.c                 |  18 ++++--
 net/netfilter/nft_expr_template.c      |  20 ++++---
 net/netfilter/nft_exthdr.c             |  16 ++++--
 net/netfilter/nft_immediate.c          |  18 ++++--
 net/netfilter/nft_limit.c              |  18 ++++--
 net/netfilter/nft_log.c                |  18 ++++--
 net/netfilter/nft_lookup.c             |  16 ++++--
 net/netfilter/nft_meta.c               |  18 ++++--
 net/netfilter/nft_payload.c            |  18 ++++--
 17 files changed, 267 insertions(+), 148 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 677dd79380ed..66d0359702c6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -222,25 +222,45 @@ extern int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 extern void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 				 struct nft_set_binding *binding);
 
+
 /**
- *	struct nft_expr_ops - nf_tables expression operations
+ *	struct nft_expr_type - nf_tables expression type
  *
- *	@eval: Expression evaluation function
- *	@init: initialization function
- *	@destroy: destruction function
- *	@dump: function to dump parameters
+ *	@select_ops: function to select nft_expr_ops
+ *	@ops: default ops, used when no select_ops functions is present
  *	@list: used internally
  *	@name: Identifier
  *	@owner: module reference
  *	@policy: netlink attribute policy
  *	@maxattr: highest netlink attribute number
+ */
+struct nft_expr_type {
+	const struct nft_expr_ops	*(*select_ops)(const struct nlattr * const tb[]);
+	const struct nft_expr_ops	*ops;
+	struct list_head		list;
+	const char			*name;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	unsigned int			maxattr;
+};
+
+/**
+ *	struct nft_expr_ops - nf_tables expression operations
+ *
+ *	@eval: Expression evaluation function
  *	@size: full expression size, including private data size
+ *	@init: initialization function
+ *	@destroy: destruction function
+ *	@dump: function to dump parameters
+ *	@type: expression type
  */
 struct nft_expr;
 struct nft_expr_ops {
 	void				(*eval)(const struct nft_expr *expr,
 						struct nft_data data[NFT_REG_MAX + 1],
 						const struct nft_pktinfo *pkt);
+	unsigned int			size;
+
 	int				(*init)(const struct nft_ctx *ctx,
 						const struct nft_expr *expr,
 						const struct nlattr * const tb[]);
@@ -248,14 +268,10 @@ struct nft_expr_ops {
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
 	const struct nft_data *		(*get_verdict)(const struct nft_expr *expr);
-	struct list_head		list;
-	const char			*name;
-	struct module			*owner;
-	const struct nla_policy		*policy;
-	unsigned int			maxattr;
-	unsigned int			size;
+	const struct nft_expr_type	*type;
 };
 
+#define NFT_EXPR_MAXATTR		16
 #define NFT_EXPR_SIZE(size)		(sizeof(struct nft_expr) + \
 					 ALIGN(size, __alignof__(struct nft_expr)))
 
@@ -418,8 +434,8 @@ extern void nft_unregister_afinfo(struct nft_af_info *);
 extern int nft_register_table(struct nft_table *, int family);
 extern void nft_unregister_table(struct nft_table *, int family);
 
-extern int nft_register_expr(struct nft_expr_ops *);
-extern void nft_unregister_expr(struct nft_expr_ops *);
+extern int nft_register_expr(struct nft_expr_type *);
+extern void nft_unregister_expr(struct nft_expr_type *);
 
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nf_table_nat_ipv4.c
index 2a6f184c10bd..2ecce39077a3 100644
--- a/net/ipv4/netfilter/nf_table_nat_ipv4.c
+++ b/net/ipv4/netfilter/nf_table_nat_ipv4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -149,15 +149,21 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_nat_ops __read_mostly = {
-	.name		= "nat",
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type		= &nft_nat_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_nat_eval,
 	.init		= nft_nat_init,
 	.dump		= nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name		= "nat",
+	.ops		= &nft_nat_ops,
 	.policy		= nft_nat_policy,
 	.maxattr	= NFTA_NAT_MAX,
+	.owner		= THIS_MODULE,
 };
 
 /*
@@ -382,7 +388,7 @@ static int __init nf_table_nat_init(void)
 	if (err < 0)
 		goto err1;
 
-	err = nft_register_expr(&nft_nat_ops);
+	err = nft_register_expr(&nft_nat_type);
 	if (err < 0)
 		goto err2;
 
@@ -396,7 +402,7 @@ err1:
 
 static void __exit nf_table_nat_exit(void)
 {
-	nft_unregister_expr(&nft_nat_ops);
+	nft_unregister_expr(&nft_nat_type);
 	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
 }
 
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index b4ee8d3bb1e4..fff5ba1a33b7 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -88,25 +88,31 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops reject_ops __read_mostly = {
-	.name		= "reject",
+static struct nft_expr_type nft_reject_type;
+static const struct nft_expr_ops nft_reject_ops = {
+	.type		= &nft_reject_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_reject_eval,
 	.init		= nft_reject_init,
 	.dump		= nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_type __read_mostly = {
+	.name		= "reject",
+	.ops		= &nft_reject_ops,
 	.policy		= nft_reject_policy,
 	.maxattr	= NFTA_REJECT_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_reject_module_init(void)
 {
-	return nft_register_expr(&reject_ops);
+	return nft_register_expr(&nft_reject_type);
 }
 
 static void __exit nft_reject_module_exit(void)
 {
-	nft_unregister_expr(&reject_ops);
+	nft_unregister_expr(&nft_reject_type);
 }
 
 module_init(nft_reject_module_init);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5092c817c222..6dac9a3c9c40 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -840,64 +840,64 @@ static void nft_ctx_init(struct nft_ctx *ctx,
  */
 
 /**
- *	nft_register_expr - register nf_tables expr operations
- *	@ops: expr operations
+ *	nft_register_expr - register nf_tables expr type
+ *	@ops: expr type
  *
- *	Registers the expr operations for use with nf_tables. Returns zero on
+ *	Registers the expr type for use with nf_tables. Returns zero on
  *	success or a negative errno code otherwise.
  */
-int nft_register_expr(struct nft_expr_ops *ops)
+int nft_register_expr(struct nft_expr_type *type)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_add_tail(&ops->list, &nf_tables_expressions);
+	list_add_tail(&type->list, &nf_tables_expressions);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nft_register_expr);
 
 /**
- *	nft_unregister_expr - unregister nf_tables expr operations
- *	@ops: expr operations
+ *	nft_unregister_expr - unregister nf_tables expr type
+ *	@ops: expr type
  *
- * 	Unregisters the expr operations for use with nf_tables.
+ * 	Unregisters the expr typefor use with nf_tables.
  */
-void nft_unregister_expr(struct nft_expr_ops *ops)
+void nft_unregister_expr(struct nft_expr_type *type)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_del(&ops->list);
+	list_del(&type->list);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
 EXPORT_SYMBOL_GPL(nft_unregister_expr);
 
-static const struct nft_expr_ops *__nft_expr_ops_get(struct nlattr *nla)
+static const struct nft_expr_type *__nft_expr_type_get(struct nlattr *nla)
 {
-	const struct nft_expr_ops *ops;
+	const struct nft_expr_type *type;
 
-	list_for_each_entry(ops, &nf_tables_expressions, list) {
-		if (!nla_strcmp(nla, ops->name))
-			return ops;
+	list_for_each_entry(type, &nf_tables_expressions, list) {
+		if (!nla_strcmp(nla, type->name))
+			return type;
 	}
 	return NULL;
 }
 
-static const struct nft_expr_ops *nft_expr_ops_get(struct nlattr *nla)
+static const struct nft_expr_type *nft_expr_type_get(struct nlattr *nla)
 {
-	const struct nft_expr_ops *ops;
+	const struct nft_expr_type *type;
 
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	ops = __nft_expr_ops_get(nla);
-	if (ops != NULL && try_module_get(ops->owner))
-		return ops;
+	type = __nft_expr_type_get(nla);
+	if (type != NULL && try_module_get(type->owner))
+		return type;
 
 #ifdef CONFIG_MODULES
-	if (ops == NULL) {
+	if (type == NULL) {
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 		request_module("nft-expr-%.*s",
 			       nla_len(nla), (char *)nla_data(nla));
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (__nft_expr_ops_get(nla))
+		if (__nft_expr_type_get(nla))
 			return ERR_PTR(-EAGAIN);
 	}
 #endif
@@ -912,7 +912,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
 static int nf_tables_fill_expr_info(struct sk_buff *skb,
 				    const struct nft_expr *expr)
 {
-	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->name))
+	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
 		goto nla_put_failure;
 
 	if (expr->ops->dump) {
@@ -932,28 +932,52 @@ nla_put_failure:
 
 struct nft_expr_info {
 	const struct nft_expr_ops	*ops;
-	struct nlattr			*tb[NFTA_EXPR_MAX + 1];
+	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1];
 };
 
 static int nf_tables_expr_parse(const struct nlattr *nla,
 				struct nft_expr_info *info)
 {
+	const struct nft_expr_type *type;
 	const struct nft_expr_ops *ops;
+	struct nlattr *tb[NFTA_EXPR_MAX + 1];
 	int err;
 
-	err = nla_parse_nested(info->tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+	err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
 	if (err < 0)
 		return err;
 
-	ops = nft_expr_ops_get(info->tb[NFTA_EXPR_NAME]);
-	if (IS_ERR(ops))
-		return PTR_ERR(ops);
+	type = nft_expr_type_get(tb[NFTA_EXPR_NAME]);
+	if (IS_ERR(type))
+		return PTR_ERR(type);
+
+	if (tb[NFTA_EXPR_DATA]) {
+		err = nla_parse_nested(info->tb, type->maxattr,
+				       tb[NFTA_EXPR_DATA], type->policy);
+		if (err < 0)
+			goto err1;
+	} else
+		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
+
+	if (type->select_ops != NULL) {
+		ops = type->select_ops((const struct nlattr * const *)info->tb);
+		if (IS_ERR(ops)) {
+			err = PTR_ERR(ops);
+			goto err1;
+		}
+	} else
+		ops = type->ops;
+
 	info->ops = ops;
 	return 0;
+
+err1:
+	module_put(type->owner);
+	return err;
 }
 
 static int nf_tables_newexpr(const struct nft_ctx *ctx,
-			     struct nft_expr_info *info,
+			     const struct nft_expr_info *info,
 			     struct nft_expr *expr)
 {
 	const struct nft_expr_ops *ops = info->ops;
@@ -961,23 +985,11 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
 
 	expr->ops = ops;
 	if (ops->init) {
-		struct nlattr *ma[ops->maxattr + 1];
-
-		if (info->tb[NFTA_EXPR_DATA]) {
-			err = nla_parse_nested(ma, ops->maxattr,
-					       info->tb[NFTA_EXPR_DATA],
-					       ops->policy);
-			if (err < 0)
-				goto err1;
-		} else
-			memset(ma, 0, sizeof(ma[0]) * (ops->maxattr + 1));
-
-		err = ops->init(ctx, expr, (const struct nlattr **)ma);
+		err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
 		if (err < 0)
 			goto err1;
 	}
 
-	info->ops = NULL;
 	return 0;
 
 err1:
@@ -989,7 +1001,7 @@ static void nf_tables_expr_destroy(struct nft_expr *expr)
 {
 	if (expr->ops->destroy)
 		expr->ops->destroy(expr);
-	module_put(expr->ops->owner);
+	module_put(expr->ops->type->owner);
 }
 
 /*
@@ -1313,6 +1325,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
 		if (err < 0)
 			goto err2;
+		info[i].ops = NULL;
 		expr = nft_expr_next(expr);
 	}
 
@@ -1341,7 +1354,7 @@ err2:
 err1:
 	for (i = 0; i < n; i++) {
 		if (info[i].ops != NULL)
-			module_put(info[i].ops->owner);
+			module_put(info[i].ops->type->owner);
 	}
 	return err;
 }
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 0f7501506367..4fb6ee2c1106 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -118,23 +118,29 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_bitwise_ops __read_mostly = {
-	.name		= "bitwise",
+static struct nft_expr_type nft_bitwise_type;
+static const struct nft_expr_ops nft_bitwise_ops = {
+	.type		= &nft_bitwise_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_bitwise_eval,
 	.init		= nft_bitwise_init,
 	.dump		= nft_bitwise_dump,
+};
+
+static struct nft_expr_type nft_bitwise_type __read_mostly = {
+	.name		= "bitwise",
+	.ops		= &nft_bitwise_ops,
 	.policy		= nft_bitwise_policy,
 	.maxattr	= NFTA_BITWISE_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_bitwise_module_init(void)
 {
-	return nft_register_expr(&nft_bitwise_ops);
+	return nft_register_expr(&nft_bitwise_type);
 }
 
 void nft_bitwise_module_exit(void)
 {
-	nft_unregister_expr(&nft_bitwise_ops);
+	nft_unregister_expr(&nft_bitwise_type);
 }
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 8b0657a4d17b..c39ed8d29df1 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -145,23 +145,29 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_byteorder_ops __read_mostly = {
-	.name		= "byteorder",
+static struct nft_expr_type nft_byteorder_type;
+static const struct nft_expr_ops nft_byteorder_ops = {
+	.type		= &nft_byteorder_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_byteorder_eval,
 	.init		= nft_byteorder_init,
 	.dump		= nft_byteorder_dump,
+};
+
+static struct nft_expr_type nft_byteorder_type __read_mostly = {
+	.name		= "byteorder",
+	.ops		= &nft_byteorder_ops,
 	.policy		= nft_byteorder_policy,
 	.maxattr	= NFTA_BYTEORDER_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_byteorder_module_init(void)
 {
-	return nft_register_expr(&nft_byteorder_ops);
+	return nft_register_expr(&nft_byteorder_type);
 }
 
 void nft_byteorder_module_exit(void)
 {
-	nft_unregister_expr(&nft_byteorder_ops);
+	nft_unregister_expr(&nft_byteorder_type);
 }
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index e734d670120a..2c9d5fef2e63 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -124,23 +124,29 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_cmp_ops __read_mostly = {
-	.name		= "cmp",
+static struct nft_expr_type nft_cmp_type;
+static const struct nft_expr_ops nft_cmp_ops = {
+	.type		= &nft_cmp_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_cmp_eval,
 	.init		= nft_cmp_init,
 	.dump		= nft_cmp_dump,
+};
+
+static struct nft_expr_type nft_cmp_type __read_mostly = {
+	.name		= "cmp",
+	.ops		= &nft_cmp_ops,
 	.policy		= nft_cmp_policy,
 	.maxattr	= NFTA_CMP_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_cmp_module_init(void)
 {
-	return nft_register_expr(&nft_cmp_ops);
+	return nft_register_expr(&nft_cmp_type);
 }
 
 void nft_cmp_module_exit(void)
 {
-	nft_unregister_expr(&nft_cmp_ops);
+	nft_unregister_expr(&nft_cmp_type);
 }
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 33c5d36819bb..c89ee486ce54 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -78,25 +78,31 @@ static int nft_counter_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static struct nft_expr_ops nft_counter_ops __read_mostly = {
-	.name		= "counter",
+static struct nft_expr_type nft_counter_type;
+static const struct nft_expr_ops nft_counter_ops = {
+	.type		= &nft_counter_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
-	.policy		= nft_counter_policy,
-	.maxattr	= NFTA_COUNTER_MAX,
-	.owner		= THIS_MODULE,
 	.eval		= nft_counter_eval,
 	.init		= nft_counter_init,
 	.dump		= nft_counter_dump,
 };
 
+static struct nft_expr_type nft_counter_type __read_mostly = {
+	.name		= "counter",
+	.ops		= &nft_counter_ops,
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.owner		= THIS_MODULE,
+};
+
 static int __init nft_counter_module_init(void)
 {
-	return nft_register_expr(&nft_counter_ops);
+	return nft_register_expr(&nft_counter_type);
 }
 
 static void __exit nft_counter_module_exit(void)
 {
-	nft_unregister_expr(&nft_counter_ops);
+	nft_unregister_expr(&nft_counter_type);
 }
 
 module_init(nft_counter_module_init);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index a1756d678226..955f4e6e7089 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -222,26 +222,32 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_ct_ops __read_mostly = {
-	.name		= "ct",
+static struct nft_expr_type nft_ct_type;
+static const struct nft_expr_ops nft_ct_ops = {
+	.type		= &nft_ct_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_ct_eval,
 	.init		= nft_ct_init,
 	.destroy	= nft_ct_destroy,
 	.dump		= nft_ct_dump,
+};
+
+static struct nft_expr_type nft_ct_type __read_mostly = {
+	.name		= "ct",
+	.ops		= &nft_ct_ops,
 	.policy		= nft_ct_policy,
 	.maxattr	= NFTA_CT_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_ct_module_init(void)
 {
-	return nft_register_expr(&nft_ct_ops);
+	return nft_register_expr(&nft_ct_type);
 }
 
 static void __exit nft_ct_module_exit(void)
 {
-	nft_unregister_expr(&nft_ct_ops);
+	nft_unregister_expr(&nft_ct_type);
 }
 
 module_init(nft_ct_module_init);
diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c
index 9fc8eb308193..b6eed4d5a096 100644
--- a/net/netfilter/nft_expr_template.c
+++ b/net/netfilter/nft_expr_template.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -33,7 +33,7 @@ static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
 
 static int nft_template_init(const struct nft_ctx *ctx,
 			   const struct nft_expr *expr,
-			   const struct nlattr *tb[])
+			   const struct nlattr * const tb[])
 {
 	struct nft_template *priv = nft_expr_priv(expr);
 
@@ -58,26 +58,32 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops template_ops __read_mostly = {
-	.name		= "template",
+static struct nft_expr_type nft_template_type;
+static const struct nft_expr_ops nft_template_ops = {
+	.type		= &nft_template_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_template_eval,
 	.init		= nft_template_init,
 	.destroy	= nft_template_destroy,
 	.dump		= nft_template_dump,
+};
+
+static struct nft_expr_type nft_template_type __read_mostly = {
+	.name		= "template",
+	.ops		= &nft_template_ops,
 	.policy		= nft_template_policy,
 	.maxattr	= NFTA_TEMPLATE_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_template_module_init(void)
 {
-	return nft_register_expr(&template_ops);
+	return nft_register_expr(&nft_template_type);
 }
 
 static void __exit nft_template_module_exit(void)
 {
-	nft_unregister_expr(&template_ops);
+	nft_unregister_expr(&nft_template_type);
 }
 
 module_init(nft_template_module_init);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 21c6a6b7b662..8e0bb75e7c51 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -98,25 +98,31 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops exthdr_ops __read_mostly = {
-	.name		= "exthdr",
+static struct nft_expr_type nft_exthdr_type;
+static const struct nft_expr_ops nft_exthdr_ops = {
+	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_exthdr_eval,
 	.init		= nft_exthdr_init,
 	.dump		= nft_exthdr_dump,
+};
+
+static struct nft_expr_type nft_exthdr_type __read_mostly = {
+	.name		= "exthdr",
+	.ops		= &nft_exthdr_ops,
 	.policy		= nft_exthdr_policy,
 	.maxattr	= NFTA_EXTHDR_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_exthdr_module_init(void)
 {
-	return nft_register_expr(&exthdr_ops);
+	return nft_register_expr(&nft_exthdr_type);
 }
 
 static void __exit nft_exthdr_module_exit(void)
 {
-	nft_unregister_expr(&exthdr_ops);
+	nft_unregister_expr(&nft_exthdr_type);
 }
 
 module_init(nft_exthdr_module_init);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 78334bf37007..1bfeeaf865b6 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -100,25 +100,31 @@ static const struct nft_data *nft_immediate_get_verdict(const struct nft_expr *e
 		return NULL;
 }
 
-static struct nft_expr_ops nft_imm_ops __read_mostly = {
-	.name		= "immediate",
+static struct nft_expr_type nft_imm_type;
+static const struct nft_expr_ops nft_imm_ops = {
+	.type		= &nft_imm_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_immediate_eval,
 	.init		= nft_immediate_init,
 	.destroy	= nft_immediate_destroy,
 	.dump		= nft_immediate_dump,
 	.get_verdict	= nft_immediate_get_verdict,
+};
+
+static struct nft_expr_type nft_imm_type __read_mostly = {
+	.name		= "immediate",
+	.ops		= &nft_imm_ops,
 	.policy		= nft_immediate_policy,
 	.maxattr	= NFTA_IMMEDIATE_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_immediate_module_init(void)
 {
-	return nft_register_expr(&nft_imm_ops);
+	return nft_register_expr(&nft_imm_type);
 }
 
 void nft_immediate_module_exit(void)
 {
-	nft_unregister_expr(&nft_imm_ops);
+	nft_unregister_expr(&nft_imm_type);
 }
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index e0e3fc8aebc3..85da5bd02f64 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -84,25 +84,31 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_limit_ops __read_mostly = {
-	.name		= "limit",
+static struct nft_expr_type nft_limit_type;
+static const struct nft_expr_ops nft_limit_ops = {
+	.type		= &nft_limit_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_limit_eval,
 	.init		= nft_limit_init,
 	.dump		= nft_limit_dump,
+};
+
+static struct nft_expr_type nft_limit_type __read_mostly = {
+	.name		= "limit",
+	.ops		= &nft_limit_ops,
 	.policy		= nft_limit_policy,
 	.maxattr	= NFTA_LIMIT_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_limit_module_init(void)
 {
-	return nft_register_expr(&nft_limit_ops);
+	return nft_register_expr(&nft_limit_type);
 }
 
 static void __exit nft_limit_module_exit(void)
 {
-	nft_unregister_expr(&nft_limit_ops);
+	nft_unregister_expr(&nft_limit_type);
 }
 
 module_init(nft_limit_module_init);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index da495c3b1e7e..57cad072a13e 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -110,26 +110,32 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_log_ops __read_mostly = {
-	.name		= "log",
+static struct nft_expr_type nft_log_type;
+static const struct nft_expr_ops nft_log_ops = {
+	.type		= &nft_log_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_log)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_log_eval,
 	.init		= nft_log_init,
 	.destroy	= nft_log_destroy,
 	.dump		= nft_log_dump,
+};
+
+static struct nft_expr_type nft_log_type __read_mostly = {
+	.name		= "log",
+	.ops		= &nft_log_ops,
 	.policy		= nft_log_policy,
 	.maxattr	= NFTA_LOG_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_log_module_init(void)
 {
-	return nft_register_expr(&nft_log_ops);
+	return nft_register_expr(&nft_log_type);
 }
 
 static void __exit nft_log_module_exit(void)
 {
-	nft_unregister_expr(&nft_log_ops);
+	nft_unregister_expr(&nft_log_type);
 }
 
 module_init(nft_log_module_init);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 4962d2173678..8a6116b75b5a 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -112,24 +112,30 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_lookup_ops __read_mostly = {
-	.name		= "lookup",
+static struct nft_expr_type nft_lookup_type;
+static const struct nft_expr_ops nft_lookup_ops = {
+	.type		= &nft_lookup_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_lookup_eval,
 	.init		= nft_lookup_init,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
+};
+
+static struct nft_expr_type nft_lookup_type __read_mostly = {
+	.name		= "lookup",
+	.ops		= &nft_lookup_ops,
 	.policy		= nft_lookup_policy,
 	.maxattr	= NFTA_LOOKUP_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_lookup_module_init(void)
 {
-	return nft_register_expr(&nft_lookup_ops);
+	return nft_register_expr(&nft_lookup_type);
 }
 
 void nft_lookup_module_exit(void)
 {
-	nft_unregister_expr(&nft_lookup_ops);
+	nft_unregister_expr(&nft_lookup_type);
 }
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 96735aa2f039..8c28220a90b3 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -193,25 +193,31 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_meta_ops __read_mostly = {
-	.name		= "meta",
+static struct nft_expr_type nft_meta_type;
+static const struct nft_expr_ops nft_meta_ops = {
+	.type		= &nft_meta_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_meta_eval,
 	.init		= nft_meta_init,
 	.dump		= nft_meta_dump,
+};
+
+static struct nft_expr_type nft_meta_type __read_mostly = {
+	.name		= "meta",
+	.ops		= &nft_meta_ops,
 	.policy		= nft_meta_policy,
 	.maxattr	= NFTA_META_MAX,
+	.owner		= THIS_MODULE,
 };
 
 static int __init nft_meta_module_init(void)
 {
-	return nft_register_expr(&nft_meta_ops);
+	return nft_register_expr(&nft_meta_type);
 }
 
 static void __exit nft_meta_module_exit(void)
 {
-	nft_unregister_expr(&nft_meta_ops);
+	nft_unregister_expr(&nft_meta_type);
 }
 
 module_init(nft_meta_module_init);
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 329f134b3f89..d99db6e37fb1 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -115,23 +115,29 @@ nla_put_failure:
 	return -1;
 }
 
-static struct nft_expr_ops nft_payload_ops __read_mostly = {
-	.name		= "payload",
+static struct nft_expr_type nft_payload_type;
+static const struct nft_expr_ops nft_payload_ops = {
+	.type		= &nft_payload_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload)),
-	.owner		= THIS_MODULE,
 	.eval		= nft_payload_eval,
 	.init		= nft_payload_init,
 	.dump		= nft_payload_dump,
+};
+
+static struct nft_expr_type nft_payload_type __read_mostly = {
+	.name		= "payload",
+	.ops		= &nft_payload_ops,
 	.policy		= nft_payload_policy,
 	.maxattr	= NFTA_PAYLOAD_MAX,
+	.owner		= THIS_MODULE,
 };
 
 int __init nft_payload_module_init(void)
 {
-	return nft_register_expr(&nft_payload_ops);
+	return nft_register_expr(&nft_payload_type);
 }
 
 void nft_payload_module_exit(void)
 {
-	nft_unregister_expr(&nft_payload_ops);
+	nft_unregister_expr(&nft_payload_type);
 }
-- 
cgit v1.2.3


From 9370761c56b66aa5c65e069a7b010111a025018d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 10 Oct 2013 23:21:26 +0200
Subject: netfilter: nf_tables: convert built-in tables/chains to chain types

This patch converts built-in tables/chains to chain types that
allows you to deploy customized table and chain configurations from
userspace.

After this patch, you have to specify the chain type when
creating a new chain:

 add chain ip filter output { type filter hook input priority 0; }
                              ^^^^ ------

The existing chain types after this patch are: filter, route and
nat. Note that tables are just containers of chains with no specific
semantics, which is a significant change with regards to iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h         |  31 ++-
 include/uapi/linux/netfilter/nf_tables.h  |   2 +
 net/ipv4/netfilter/Kconfig                |   8 +-
 net/ipv4/netfilter/Makefile               |   4 +-
 net/ipv4/netfilter/nf_table_nat_ipv4.c    | 415 ------------------------------
 net/ipv4/netfilter/nf_table_route_ipv4.c  |  97 -------
 net/ipv4/netfilter/nf_tables_ipv4.c       |  21 ++
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   | 353 +++++++++++++++++++++++++
 net/ipv4/netfilter/nft_chain_route_ipv4.c |  86 +++++++
 net/ipv6/netfilter/Kconfig                |   4 +-
 net/ipv6/netfilter/Makefile               |   2 +-
 net/ipv6/netfilter/nf_table_route_ipv6.c  |  93 -------
 net/ipv6/netfilter/nf_tables_ipv6.c       |  22 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c |  82 ++++++
 net/netfilter/nf_tables_api.c             | 197 +++++++-------
 15 files changed, 682 insertions(+), 735 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nf_table_nat_ipv4.c
 delete mode 100644 net/ipv4/netfilter/nf_table_route_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_chain_nat_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_chain_route_ipv4.c
 delete mode 100644 net/ipv6/netfilter/nf_table_route_ipv6.c
 create mode 100644 net/ipv6/netfilter/nft_chain_route_ipv6.c

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 66d0359702c6..8403f7f52e81 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -336,7 +336,6 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
 
 enum nft_chain_flags {
 	NFT_BASE_CHAIN			= 0x1,
-	NFT_CHAIN_BUILTIN		= 0x2,
 };
 
 /**
@@ -362,14 +361,23 @@ struct nft_chain {
 	char				name[NFT_CHAIN_MAXNAMELEN];
 };
 
+enum nft_chain_type {
+	NFT_CHAIN_T_DEFAULT = 0,
+	NFT_CHAIN_T_ROUTE,
+	NFT_CHAIN_T_NAT,
+	NFT_CHAIN_T_MAX
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
+ *	@type: chain type
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
+	enum nft_chain_type		type;
 	struct nft_chain		chain;
 };
 
@@ -384,10 +392,6 @@ extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
 				 const struct net_device *out,
 				 int (*okfn)(struct sk_buff *));
 
-enum nft_table_flags {
-	NFT_TABLE_BUILTIN		= 0x1,
-};
-
 /**
  *	struct nft_table - nf_tables table
  *
@@ -431,8 +435,17 @@ struct nft_af_info {
 extern int nft_register_afinfo(struct nft_af_info *);
 extern void nft_unregister_afinfo(struct nft_af_info *);
 
-extern int nft_register_table(struct nft_table *, int family);
-extern void nft_unregister_table(struct nft_table *, int family);
+struct nf_chain_type {
+	unsigned int		hook_mask;
+	const char		*name;
+	enum nft_chain_type	type;
+	nf_hookfn		*fn[NF_MAX_HOOKS];
+	struct module		*me;
+	int			family;
+};
+
+extern int nft_register_chain_type(struct nf_chain_type *);
+extern void nft_unregister_chain_type(struct nf_chain_type *);
 
 extern int nft_register_expr(struct nft_expr_type *);
 extern void nft_unregister_expr(struct nft_expr_type *);
@@ -440,8 +453,8 @@ extern void nft_unregister_expr(struct nft_expr_type *);
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
-#define MODULE_ALIAS_NFT_TABLE(family, name) \
-	MODULE_ALIAS("nft-table-" __stringify(family) "-" name)
+#define MODULE_ALIAS_NFT_CHAIN(family, name) \
+	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
 
 #define MODULE_ALIAS_NFT_EXPR(name) \
 	MODULE_ALIAS("nft-expr-" name)
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9e924014efe3..779cf951c8de 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,6 +115,7 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -122,6 +123,7 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_TYPE,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index eb1d56ece361..ae65fe98bfbe 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -44,13 +44,13 @@ config NFT_REJECT_IPV4
 	depends on NF_TABLES_IPV4
 	tristate "nf_tables IPv4 reject support"
 
-config NF_TABLE_ROUTE_IPV4
+config NFT_CHAIN_ROUTE_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables route table support"
+	tristate "IPv4 nf_tables route chain support"
 
-config NF_TABLE_NAT_IPV4
+config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables nat table support"
+	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index b2f01cd2cd65..91e0bd71a6d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -29,8 +29,8 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
 obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV4) += nf_table_route_ipv4.o
-obj-$(CONFIG_NF_TABLE_NAT_IPV4) += nf_table_nat_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nf_table_nat_ipv4.c
deleted file mode 100644
index 2ecce39077a3..000000000000
--- a/net/ipv4/netfilter/nf_table_nat_ipv4.c
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ip.h>
-
-struct nft_nat {
-	enum nft_registers	sreg_addr_min:8;
-	enum nft_registers	sreg_addr_max:8;
-	enum nft_registers	sreg_proto_min:8;
-	enum nft_registers	sreg_proto_max:8;
-	enum nf_nat_manip_type	type;
-};
-
-static void nft_nat_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_addr_min) {
-		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
-		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
-		range.flags |= NF_NAT_RANGE_MAP_IPS;
-	}
-
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = data[priv->sreg_proto_min].data[0];
-		range.max_proto.all = data[priv->sreg_proto_max].data[0];
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	data[NFT_REG_VERDICT].verdict =
-		nf_nat_setup_info(ct, &range, priv->type);
-}
-
-static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
-	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
-};
-
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_nat *priv = nft_expr_priv(expr);
-	int err;
-
-	if (tb[NFTA_NAT_TYPE] == NULL)
-		return -EINVAL;
-
-	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
-	case NFT_NAT_SNAT:
-		priv->type = NF_NAT_MANIP_SRC;
-		break;
-	case NFT_NAT_DNAT:
-		priv->type = NF_NAT_MANIP_DST;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MIN]) {
-		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
-		err = nft_validate_input_register(priv->sreg_addr_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MAX]) {
-		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
-		err = nft_validate_input_register(priv->sreg_addr_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_addr_max = priv->sreg_addr_min;
-
-	if (tb[NFTA_NAT_PROTO_MIN]) {
-		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
-		err = nft_validate_input_register(priv->sreg_proto_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_PROTO_MAX]) {
-		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
-		err = nft_validate_input_register(priv->sreg_proto_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_proto_max = priv->sreg_proto_min;
-
-	return 0;
-}
-
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-
-	switch (priv->type) {
-	case NF_NAT_MANIP_SRC:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
-			goto nla_put_failure;
-		break;
-	case NF_NAT_MANIP_DST:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
-			goto nla_put_failure;
-		break;
-	}
-
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_type nft_nat_type;
-static const struct nft_expr_ops nft_nat_ops = {
-	.type		= &nft_nat_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.eval		= nft_nat_eval,
-	.init		= nft_nat_init,
-	.dump		= nft_nat_dump,
-};
-
-static struct nft_expr_type nft_nat_type __read_mostly = {
-	.name		= "nat",
-	.ops		= &nft_nat_ops,
-	.policy		= nft_nat_policy,
-	.maxattr	= NFTA_NAT_MAX,
-	.owner		= THIS_MODULE,
-};
-
-/*
- * NAT table
- */
-
-static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
-			      struct sk_buff *skb,
-			      const struct net_device *in,
-			      const struct net_device *out,
-			      int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_nat *nat;
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
-	unsigned int ret;
-
-	if (ct == NULL || nf_ct_is_untracked(ct))
-		return NF_ACCEPT;
-
-	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
-
-	nat = nfct_nat(ct);
-	if (nat == NULL) {
-		/* Conntrack module was loaded late, can't add extension. */
-		if (nf_ct_is_confirmed(ct))
-			return NF_ACCEPT;
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL)
-			return NF_ACCEPT;
-	}
-
-	switch (ctinfo) {
-	case IP_CT_RELATED:
-	case IP_CT_RELATED + IP_CT_IS_REPLY:
-		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-							   ops->hooknum))
-				return NF_DROP;
-			else
-				return NF_ACCEPT;
-		}
-		/* Fall through */
-	case IP_CT_NEW:
-		if (nf_nat_initialized(ct, maniptype))
-			break;
-
-		ret = nft_do_chain(ops, skb, in, out, okfn);
-		if (ret != NF_ACCEPT)
-			return ret;
-		if (!nf_nat_initialized(ct, maniptype)) {
-			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
-			if (ret != NF_ACCEPT)
-				return ret;
-		}
-	default:
-		break;
-	}
-
-	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
-}
-
-static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
-				      struct sk_buff *skb,
-				      const struct net_device *in,
-				      const struct net_device *out,
-				      int (*okfn)(struct sk_buff *))
-{
-	__be32 daddr = ip_hdr(skb)->daddr;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    ip_hdr(skb)->daddr != daddr) {
-		skb_dst_drop(skb);
-	}
-	return ret;
-}
-
-static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo __maybe_unused;
-	const struct nf_conn *ct __maybe_unused;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if (ct->tuplehash[dir].tuple.src.u3.ip !=
-		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
-		    ct->tuplehash[dir].tuple.src.u.all !=
-		    ct->tuplehash[!dir].tuple.dst.u.all)
-			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
-								ret : NF_DROP;
-	}
-#endif
-	return ret;
-}
-
-static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
-				  struct sk_buff *skb,
-				  const struct net_device *in,
-				  const struct net_device *out,
-				  int (*okfn)(struct sk_buff *))
-{
-	enum ip_conntrack_info ctinfo;
-	const struct nf_conn *ct;
-	unsigned int ret;
-
-	ret = nf_nat_fn(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
-		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
-		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
-		    ct->tuplehash[!dir].tuple.src.u3.ip) {
-			if (ip_route_me_harder(skb, RTN_UNSPEC))
-				ret = NF_DROP;
-		}
-#ifdef CONFIG_XFRM
-		else if (ct->tuplehash[dir].tuple.dst.u.all !=
-			 ct->tuplehash[!dir].tuple.src.u.all)
-			if (nf_xfrm_me_harder(skb, AF_INET))
-				ret = NF_DROP;
-#endif
-	}
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_nat_prerouting __read_mostly = {
-	.chain	= {
-		.name		= "PREROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_prerouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_prerouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_prerouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_postrouting __read_mostly = {
-	.chain	= {
-		.name		= "POSTROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_postrouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_postrouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_postrouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_output,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_output.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_input __read_mostly = {
-	.chain	= {
-		.name		= "INPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_input.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_fn,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_input.chain,
-	},
-};
-
-
-static struct nft_table nf_table_nat_ipv4 __read_mostly = {
-	.name	= "nat",
-	.chains	= LIST_HEAD_INIT(nf_table_nat_ipv4.chains),
-};
-
-static int __init nf_table_nat_init(void)
-{
-	int err;
-
-	list_add_tail(&nf_chain_nat_prerouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_postrouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_output.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_input.chain.list,
-		      &nf_table_nat_ipv4.chains);
-
-	err = nft_register_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
-	if (err < 0)
-		goto err1;
-
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		goto err2;
-
-	return 0;
-
-err2:
-	nft_unregister_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
-err1:
-	return err;
-}
-
-static void __exit nf_table_nat_exit(void)
-{
-	nft_unregister_expr(&nft_nat_type);
-	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
-}
-
-module_init(nf_table_nat_init);
-module_exit(nf_table_nat_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "nat");
-MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nf_table_route_ipv4.c b/net/ipv4/netfilter/nf_table_route_ipv4.c
deleted file mode 100644
index 4f257a1ed661..000000000000
--- a/net/ipv4/netfilter/nf_table_route_ipv4.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
-					struct sk_buff *skb,
-					const struct net_device *in,
-					const struct net_device *out,
-					int (*okfn)(struct sk_buff *))
-{
-	unsigned int ret;
-	u32 mark;
-	__be32 saddr, daddr;
-	u_int8_t tos;
-	const struct iphdr *iph;
-
-	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr) ||
-	    ip_hdrlen(skb) < sizeof(struct iphdr))
-		return NF_ACCEPT;
-
-	mark = skb->mark;
-	iph = ip_hdr(skb);
-	saddr = iph->saddr;
-	daddr = iph->daddr;
-	tos = iph->tos;
-
-	ret = nft_do_chain(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_QUEUE) {
-		iph = ip_hdr(skb);
-
-		if (iph->saddr != saddr ||
-		    iph->daddr != daddr ||
-		    skb->mark != mark ||
-		    iph->tos != tos)
-			if (ip_route_me_harder(skb, RTN_UNSPEC))
-				ret = NF_DROP;
-	}
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
-};
-
-static struct nft_table nf_table_route_ipv4 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv4.chains),
-};
-
-static int __init nf_table_route_init(void)
-{
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv4.chains);
-	return nft_register_table(&nf_table_route_ipv4, NFPROTO_IPV4);
-}
-
-static void __exit nf_table_route_exit(void)
-{
-	nft_unregister_table(&nf_table_route_ipv4, NFPROTO_IPV4);
-}
-
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 63d0a3bf53d3..23525c4c0192 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -41,14 +42,34 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv4_init(void)
 {
+	nft_register_chain_type(&filter_ipv4);
 	return nft_register_afinfo(&nft_af_ipv4);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv4);
+	nft_unregister_chain_type(&filter_ipv4);
 }
 
 module_init(nf_tables_ipv4_init);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 000000000000..cd286306be85
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers	sreg_addr_min:8;
+	enum nft_registers	sreg_addr_max:8;
+	enum nft_registers	sreg_proto_min:8;
+	enum nft_registers	sreg_proto_max:8;
+	enum nf_nat_manip_type	type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
+	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type		= &nft_nat_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.eval		= nft_nat_eval,
+	.init		= nft_nat_init,
+	.dump		= nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name		= "nat",
+	.ops		= &nft_nat_ops,
+	.policy		= nft_nat_policy,
+	.maxattr	= NFTA_NAT_MAX,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * NAT chains
+ */
+
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		ret = nft_do_chain(ops, skb, in, out, okfn);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	__be32 daddr = ip_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ip_hdr(skb)->daddr != daddr) {
+		skb_dst_drop(skb);
+	}
+	return ret;
+}
+
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.u3.ip !=
+		    ct->tuplehash[!dir].tuple.dst.u3.ip ||
+		    ct->tuplehash[dir].tuple.src.u.all !=
+		    ct->tuplehash[!dir].tuple.dst.u.all)
+			return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
+								ret : NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+struct nf_chain_type nft_chain_nat_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_fn,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_nat_init(void)
+{
+	int err;
+
+	err = nft_register_chain_type(&nft_chain_nat_ipv4);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_nat_type);
+	if (err < 0)
+		goto err;
+
+	return 0;
+
+err:
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
+	return err;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+	nft_unregister_expr(&nft_nat_type);
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 000000000000..6b84e097b8fc
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	u32 mark;
+	__be32 saddr, daddr;
+	u_int8_t tos;
+	const struct iphdr *iph;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	mark = skb->mark;
+	iph = ip_hdr(skb);
+	saddr = iph->saddr;
+	daddr = iph->daddr;
+	tos = iph->tos;
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE) {
+		iph = ip_hdr(skb);
+
+		if (iph->saddr != saddr ||
+		    iph->daddr != daddr ||
+		    skb->mark != mark ||
+		    iph->tos != tos)
+			if (ip_route_me_harder(skb, RTN_UNSPEC))
+				ret = NF_DROP;
+	}
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_route_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_route_init(void)
+{
+	return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 5677e38eeca3..23833064b7b5 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,9 +29,9 @@ config NF_TABLES_IPV6
 	depends on NF_TABLES
 	tristate "IPv6 nf_tables support"
 
-config NF_TABLE_ROUTE_IPV6
+config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
-	tristate "IPv6 nf_tables route table support"
+	tristate "IPv6 nf_tables route chain support"
 
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 956af4492d10..be4913aa524d 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -25,7 +25,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV6) += nf_table_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nf_table_route_ipv6.c b/net/ipv6/netfilter/nf_table_route_ipv6.c
deleted file mode 100644
index 48ac65c7b398..000000000000
--- a/net/ipv6/netfilter/nf_table_route_ipv6.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/route.h>
-
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
-					struct sk_buff *skb,
-					const struct net_device *in,
-					const struct net_device *out,
-					int (*okfn)(struct sk_buff *))
-{
-	unsigned int ret;
-	struct in6_addr saddr, daddr;
-	u_int8_t hop_limit;
-	u32 mark, flowlabel;
-
-	/* save source/dest address, mark, hoplimit, flowlabel, priority */
-	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
-	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
-	mark = skb->mark;
-	hop_limit = ipv6_hdr(skb)->hop_limit;
-
-	/* flowlabel and prio (includes version, which shouldn't change either */
-	flowlabel = *((u32 *)ipv6_hdr(skb));
-
-	ret = nft_do_chain(ops, skb, in, out, okfn);
-	if (ret != NF_DROP && ret != NF_QUEUE &&
-	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
-	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
-	     skb->mark != mark ||
-	     ipv6_hdr(skb)->hop_limit != hop_limit ||
-	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
-		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
-
-	return ret;
-}
-
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP6_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
-};
-
-static struct nft_table nf_table_route_ipv6 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv6.chains),
-};
-
-static int __init nf_table_route_init(void)
-{
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv6.chains);
-	return nft_register_table(&nf_table_route_ipv6, NFPROTO_IPV6);
-}
-
-static void __exit nf_table_route_exit(void)
-{
-	nft_unregister_table(&nf_table_route_ipv6, NFPROTO_IPV6);
-}
-
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index e0717cea4913..3631d6238e6f 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -39,14 +40,33 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv6_init(void)
 {
+	nft_register_chain_type(&filter_ipv6);
 	return nft_register_afinfo(&nft_af_ipv6);
 }
-
 static void __exit nf_tables_ipv6_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv6);
+	nft_unregister_chain_type(&filter_ipv6);
 }
 
 module_init(nf_tables_ipv6_init);
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
new file mode 100644
index 000000000000..4cdc992fa067
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/route.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	struct in6_addr saddr, daddr;
+	u_int8_t hop_limit;
+	u32 mark, flowlabel;
+
+	/* save source/dest address, mark, hoplimit, flowlabel, priority */
+	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+	mark = skb->mark;
+	hop_limit = ipv6_hdr(skb)->hop_limit;
+
+	/* flowlabel and prio (includes version, which shouldn't change either */
+	flowlabel = *((u32 *)ipv6_hdr(skb));
+
+	ret = nft_do_chain(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_QUEUE &&
+	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+	     skb->mark != mark ||
+	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+	     flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
+		return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_route_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+                [NF_INET_LOCAL_OUT]	= nf_route_table_hook,
+        },
+        .me		= THIS_MODULE,
+};
+
+static int __init nft_chain_route_init(void)
+{
+	return nft_register_chain_type(&nft_chain_route_ipv6);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_route_ipv6);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6dac9a3c9c40..9c2d8d5af843 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -104,8 +104,7 @@ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
 }
 
 static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
-						const struct nlattr *nla,
-						bool autoload)
+						const struct nlattr *nla)
 {
 	struct nft_table *table;
 
@@ -116,16 +115,6 @@ static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
 	if (table != NULL)
 		return table;
 
-#ifdef CONFIG_MODULES
-	if (autoload) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-table-%u-%*.s", afi->family,
-			       nla_len(nla)-1, (const char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (nft_table_lookup(afi, nla))
-			return ERR_PTR(-EAGAIN);
-	}
-#endif
 	return ERR_PTR(-ENOENT);
 }
 
@@ -134,6 +123,39 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 	return ++table->hgenerator;
 }
 
+static struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+static int __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+	int i;
+
+	for (i=0; i<NFT_CHAIN_T_MAX; i++) {
+		if (chain_type[family][i] != NULL &&
+		    !nla_strcmp(nla, chain_type[family][i]->name))
+			return i;
+	}
+	return -1;
+}
+
+static int nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+				       const struct nlattr *nla,
+				       bool autoload)
+{
+	int type;
+
+	type = __nf_tables_chain_type_lookup(afi->family, nla);
+#ifdef CONFIG_MODULES
+	if (type < 0 && autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-chain-%u-%*.s", afi->family,
+			       nla_len(nla)-1, (const char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		type = __nf_tables_chain_type_lookup(afi->family, nla);
+	}
+#endif
+	return type;
+}
+
 static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
 };
@@ -258,7 +280,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -294,7 +316,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 		return PTR_ERR(afi);
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(afi, name, false);
+	table = nf_tables_table_lookup(afi, name);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -335,13 +357,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	if (table->flags & NFT_TABLE_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (table->use)
 		return -EBUSY;
 
@@ -351,99 +370,34 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	return 0;
 }
 
-static struct nft_table *__nf_tables_table_lookup(const struct nft_af_info *afi,
-						  const char *name)
+int nft_register_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_table *table;
-
-	list_for_each_entry(table, &afi->tables, list) {
-		if (!strcmp(name, table->name))
-			return table;
-	}
-
-	return ERR_PTR(-ENOENT);
-}
-
-static int nf_tables_chain_notify(const struct sk_buff *oskb,
-				  const struct nlmsghdr *nlh,
-				  const struct nft_table *table,
-				  const struct nft_chain *chain,
-				  int event, int family);
-
-/**
- *	nft_register_table - register a built-in table
- *
- *	@table: the table to register
- *	@family: protocol family to register table with
- *
- *	Register a built-in table for use with nf_tables. Returns zero on
- *	success or a negative errno code otherwise.
- */
-int nft_register_table(struct nft_table *table, int family)
-{
-	struct nft_af_info *afi;
-	struct nft_table *t;
-	struct nft_chain *chain;
-	int err;
+	int err = 0;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-again:
-	afi = nf_tables_afinfo_lookup(family, true);
-	if (IS_ERR(afi)) {
-		err = PTR_ERR(afi);
-		if (err == -EAGAIN)
-			goto again;
-		goto err;
-	}
-
-	t = __nf_tables_table_lookup(afi, table->name);
-	if (IS_ERR(t)) {
-		err = PTR_ERR(t);
-		if (err != -ENOENT)
-			goto err;
-		t = NULL;
+	if (chain_type[ctype->family][ctype->type] != NULL) {
+		err = -EBUSY;
+		goto out;
 	}
 
-	if (t != NULL) {
-		err = -EEXIST;
-		goto err;
-	}
+	if (!try_module_get(ctype->me))
+		goto out;
 
-	table->flags |= NFT_TABLE_BUILTIN;
-	INIT_LIST_HEAD(&table->sets);
-	list_add_tail(&table->list, &afi->tables);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_NEWCHAIN, family);
-	err = 0;
-err:
+	chain_type[ctype->family][ctype->type] = ctype;
+out:
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return err;
 }
-EXPORT_SYMBOL_GPL(nft_register_table);
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
-/**
- *	nft_unregister_table - unregister a built-in table
- *
- *	@table: the table to unregister
- *	@family: protocol family to unregister table with
- *
- *	Unregister a built-in table for use with nf_tables.
- */
-void nft_unregister_table(struct nft_table *table, int family)
+void nft_unregister_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_chain *chain;
-
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_del(&table->list);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_DELCHAIN, family);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_DELTABLE, family);
+	chain_type[ctype->family][ctype->type] = NULL;
+	module_put(ctype->me);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
-EXPORT_SYMBOL_GPL(nft_unregister_table);
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
 
 /*
  * Chains
@@ -484,6 +438,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -526,6 +481,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
+
+		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
+			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
+				goto nla_put_failure;
 	}
 
 	return nlmsg_end(skb, nlh);
@@ -633,7 +592,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -680,7 +639,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -722,6 +681,17 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nf_hook_ops *ops;
+		nf_hookfn *hookfn;
+		u32 hooknum;
+		int type = NFT_CHAIN_T_DEFAULT;
+
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(afi,
+							   nla[NFTA_CHAIN_TYPE],
+							   create);
+			if (type < 0)
+				return -ENOENT;
+		}
 
 		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
 				       nft_hook_policy);
@@ -730,12 +700,20 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
 		    ha[NFTA_HOOK_PRIORITY] == NULL)
 			return -EINVAL;
-		if (ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])) >= afi->nhooks)
+
+		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		if (hooknum >= afi->nhooks)
 			return -EINVAL;
 
+		hookfn = chain_type[family][type]->fn[hooknum];
+		if (hookfn == NULL)
+			return -EOPNOTSUPP;
+
 		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
 		if (basechain == NULL)
 			return -ENOMEM;
+
+		basechain->type = type;
 		chain = &basechain->chain;
 
 		ops = &basechain->ops;
@@ -744,7 +722,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		ops->hooknum	= ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
 		ops->priority	= ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 		ops->priv	= chain;
-		ops->hook	= nft_do_chain;
+		ops->hook       = hookfn;
 		if (afi->hooks[ops->hooknum])
 			ops->hook = afi->hooks[ops->hooknum];
 
@@ -793,7 +771,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -801,9 +779,6 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
-	if (chain->flags & NFT_CHAIN_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (!list_empty(&chain->rules))
 		return -EBUSY;
 
@@ -1190,7 +1165,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1268,7 +1243,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1374,7 +1349,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1490,7 +1465,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 		return PTR_ERR(afi);
 
 	if (nla[NFTA_SET_TABLE] != NULL) {
-		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], false);
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
@@ -1820,7 +1795,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2008,7 +1983,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-- 
cgit v1.2.3


From 0ca743a5599199152a31a7146b83213c786c2eb2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 14 Oct 2013 00:06:06 +0200
Subject: netfilter: nf_tables: add compatibility layer for x_tables

This patch adds the x_tables compatibility layer. This allows you
to use existing x_tables matches and targets from nf_tables.

This compatibility later allows us to use existing matches/targets
for features that are still missing in nf_tables. We can progressively
replace them with native nf_tables extensions. It also provides the
userspace compatibility software that allows you to express the
rule-set using the iptables syntax but using the nf_tables kernel
components.

In order to get this compatibility layer working, I've done the
following things:

* add NFNL_SUBSYS_NFT_COMPAT: this new nfnetlink subsystem is used
to query the x_tables match/target revision, so we don't need to
use the native x_table getsockopt interface.

* emulate xt structures: this required extending the struct nft_pktinfo
to include the fragment offset, which is already obtained from
ip[6]_tables and that is used by some matches/targets.

* add support for default policy to base chains, required to emulate
  x_tables.

* add NFTA_CHAIN_USE attribute to obtain the number of references to
  chains, required by x_tables emulation.

* add chain packet/byte counters using per-cpu.

* support 32-64 bits compat.

For historical reasons, this patch includes the following patches
that were posted in the netfilter-devel mailing list.

From Pablo Neira Ayuso:
* nf_tables: add default policy to base chains
* netfilter: nf_tables: add NFTA_CHAIN_USE attribute
* nf_tables: nft_compat: private data of target and matches in contiguous area
* nf_tables: validate hooks for compat match/target
* nf_tables: nft_compat: release cached matches/targets
* nf_tables: x_tables support as a compile time option
* nf_tables: fix alias for xtables over nftables module
* nf_tables: add packet and byte counters per chain
* nf_tables: fix per-chain counter stats if no counters are passed
* nf_tables: don't bump chain stats
* nf_tables: add protocol and flags for xtables over nf_tables
* nf_tables: add ip[6]t_entry emulation
* nf_tables: move specific layer 3 compat code to nf_tables_ipv[4|6]
* nf_tables: support 32bits-64bits x_tables compat
* nf_tables: fix compilation if CONFIG_COMPAT is disabled

From Patrick McHardy:
* nf_tables: move policy to struct nft_base_chain
* nf_tables: send notifications for base chain policy changes

From Alexander Primak:
* nf_tables: remove the duplicate NF_INET_LOCAL_OUT

From Nicolas Dichtel:
* nf_tables: fix compilation when nf-netlink is a module

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h               |  44 +-
 include/net/netfilter/nf_tables_ipv4.h          |  23 +
 include/net/netfilter/nf_tables_ipv6.h          |  30 +
 include/uapi/linux/netfilter/Kbuild             |   1 +
 include/uapi/linux/netfilter/nf_tables.h        |  32 +
 include/uapi/linux/netfilter/nf_tables_compat.h |  38 ++
 include/uapi/linux/netfilter/nfnetlink.h        |   3 +-
 net/ipv4/netfilter/nf_tables_ipv4.c             |  32 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c         |   6 +-
 net/ipv4/netfilter/nft_chain_route_ipv4.c       |   6 +-
 net/ipv6/netfilter/nf_tables_ipv6.c             |  33 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c       |   8 +-
 net/netfilter/Kconfig                           |   9 +
 net/netfilter/Makefile                          |   1 +
 net/netfilter/nf_tables_api.c                   | 220 ++++++-
 net/netfilter/nf_tables_core.c                  |  46 +-
 net/netfilter/nft_cmp.c                         |   3 +-
 net/netfilter/nft_compat.c                      | 768 ++++++++++++++++++++++++
 net/netfilter/nft_immediate.c                   |  12 +-
 net/netfilter/nft_payload.c                     |   4 +-
 20 files changed, 1241 insertions(+), 78 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables_ipv4.h
 create mode 100644 include/net/netfilter/nf_tables_ipv6.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables_compat.h
 create mode 100644 net/netfilter/nft_compat.c

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8403f7f52e81..a68f45f0fe2e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
 
@@ -15,8 +16,23 @@ struct nft_pktinfo {
 	u8				hooknum;
 	u8				nhoff;
 	u8				thoff;
+	/* for x_tables compatibility */
+	struct xt_action_param		xt;
 };
 
+static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
+				   const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out)
+{
+	pkt->skb = skb;
+	pkt->in = pkt->xt.in = in;
+	pkt->out = pkt->xt.out = out;
+	pkt->hooknum = pkt->xt.hooknum = ops->hooknum;
+	pkt->xt.family = ops->pf;
+}
+
 struct nft_data {
 	union {
 		u32				data[4];
@@ -57,6 +73,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  * 	@afi: address family info
  * 	@table: the table the chain is contained in
  * 	@chain: the chain the rule is contained in
+ *	@nla: netlink attributes
  */
 struct nft_ctx {
 	const struct sk_buff		*skb;
@@ -64,6 +81,7 @@ struct nft_ctx {
 	const struct nft_af_info	*afi;
 	const struct nft_table		*table;
 	const struct nft_chain		*chain;
+	const struct nlattr * const 	*nla;
 };
 
 struct nft_data_desc {
@@ -235,7 +253,8 @@ extern void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@maxattr: highest netlink attribute number
  */
 struct nft_expr_type {
-	const struct nft_expr_ops	*(*select_ops)(const struct nlattr * const tb[]);
+	const struct nft_expr_ops	*(*select_ops)(const struct nft_ctx *,
+						       const struct nlattr * const tb[]);
 	const struct nft_expr_ops	*ops;
 	struct list_head		list;
 	const char			*name;
@@ -253,6 +272,8 @@ struct nft_expr_type {
  *	@destroy: destruction function
  *	@dump: function to dump parameters
  *	@type: expression type
+ *	@validate: validate expression, called during loop detection
+ *	@data: extra data to attach to this expression operation
  */
 struct nft_expr;
 struct nft_expr_ops {
@@ -267,8 +288,11 @@ struct nft_expr_ops {
 	void				(*destroy)(const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
-	const struct nft_data *		(*get_verdict)(const struct nft_expr *expr);
+	int				(*validate)(const struct nft_ctx *ctx,
+						    const struct nft_expr *expr,
+						    const struct nft_data **data);
 	const struct nft_expr_type	*type;
+	void				*data;
 };
 
 #define NFT_EXPR_MAXATTR		16
@@ -368,16 +392,25 @@ enum nft_chain_type {
 	NFT_CHAIN_T_MAX
 };
 
+struct nft_stats {
+	u64 bytes;
+	u64 pkts;
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
  *	@type: chain type
+ *	@policy: default policy
+ *	@stats: per-cpu chain stats
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
 	enum nft_chain_type		type;
+	u8				policy;
+	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
 };
 
@@ -386,11 +419,8 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai
 	return container_of(chain, struct nft_base_chain, chain);
 }
 
-extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-				 struct sk_buff *skb,
-				 const struct net_device *in,
-				 const struct net_device *out,
-				 int (*okfn)(struct sk_buff *));
+extern unsigned int nft_do_chain_pktinfo(struct nft_pktinfo *pkt,
+					 const struct nf_hook_ops *ops);
 
 /**
  *	struct nft_table - nf_tables table
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
new file mode 100644
index 000000000000..1be1c2c197ee
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -0,0 +1,23 @@
+#ifndef _NF_TABLES_IPV4_H_
+#define _NF_TABLES_IPV4_H_
+
+#include <net/netfilter/nf_tables.h>
+#include <net/ip.h>
+
+static inline void
+nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	struct iphdr *ip;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	pkt->xt.thoff = ip_hdrlen(pkt->skb);
+	ip = ip_hdr(pkt->skb);
+	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+}
+
+#endif
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
new file mode 100644
index 000000000000..4a9b88a65963
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -0,0 +1,30 @@
+#ifndef _NF_TABLES_IPV6_H_
+#define _NF_TABLES_IPV6_H_
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ipv6.h>
+
+static inline int
+nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	int protohdr, thoff = 0;
+	unsigned short frag_off;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+	/* If malformed, drop it */
+	if (protohdr < 0)
+		return -1;
+
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = frag_off;
+
+	return 0;
+}
+
+#endif
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 6ce0b7f566a7..17c3af2c4bb9 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -6,6 +6,7 @@ header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
 header-y += nf_tables.h
+header-y += nf_tables_compat.h
 header-y += nf_nat.h
 header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 779cf951c8de..1563875e6942 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,7 +115,10 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_POLICY: numeric policy of the chain (NLA_U32)
+ * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32)
  * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
+ * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -123,7 +126,10 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_POLICY,
+	NFTA_CHAIN_USE,
 	NFTA_CHAIN_TYPE,
+	NFTA_CHAIN_COUNTERS,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
@@ -135,6 +141,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING)
  * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
  * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -142,10 +149,35 @@ enum nft_rule_attributes {
 	NFTA_RULE_CHAIN,
 	NFTA_RULE_HANDLE,
 	NFTA_RULE_EXPRESSIONS,
+	NFTA_RULE_COMPAT,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
 
+/**
+ * enum nft_rule_compat_flags - nf_tables rule compat flags
+ *
+ * @NFT_RULE_COMPAT_F_INV: invert the check result
+ */
+enum nft_rule_compat_flags {
+	NFT_RULE_COMPAT_F_INV	= (1 << 1),
+	NFT_RULE_COMPAT_F_MASK	= NFT_RULE_COMPAT_F_INV,
+};
+
+/**
+ * enum nft_rule_compat_attributes - nf_tables rule compat attributes
+ *
+ * @NFTA_RULE_COMPAT_PROTO: numerice value of handled protocol (NLA_U32)
+ * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32)
+ */
+enum nft_rule_compat_attributes {
+	NFTA_RULE_COMPAT_UNSPEC,
+	NFTA_RULE_COMPAT_PROTO,
+	NFTA_RULE_COMPAT_FLAGS,
+	__NFTA_RULE_COMPAT_MAX
+};
+#define NFTA_RULE_COMPAT_MAX	(__NFTA_RULE_COMPAT_MAX - 1)
+
 /**
  * enum nft_set_flags - nf_tables set flags
  *
diff --git a/include/uapi/linux/netfilter/nf_tables_compat.h b/include/uapi/linux/netfilter/nf_tables_compat.h
new file mode 100644
index 000000000000..8310f5f76551
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables_compat.h
@@ -0,0 +1,38 @@
+#ifndef _NFT_COMPAT_NFNETLINK_H_
+#define _NFT_COMPAT_NFNETLINK_H_
+
+enum nft_target_attributes {
+	NFTA_TARGET_UNSPEC,
+	NFTA_TARGET_NAME,
+	NFTA_TARGET_REV,
+	NFTA_TARGET_INFO,
+	__NFTA_TARGET_MAX
+};
+#define NFTA_TARGET_MAX		(__NFTA_TARGET_MAX - 1)
+
+enum nft_match_attributes {
+	NFTA_MATCH_UNSPEC,
+	NFTA_MATCH_NAME,
+	NFTA_MATCH_REV,
+	NFTA_MATCH_INFO,
+	__NFTA_MATCH_MAX
+};
+#define NFTA_MATCH_MAX		(__NFTA_MATCH_MAX - 1)
+
+#define NFT_COMPAT_NAME_MAX	32
+
+enum {
+	NFNL_MSG_COMPAT_GET,
+	NFNL_MSG_COMPAT_MAX
+};
+
+enum {
+	NFTA_COMPAT_UNSPEC = 0,
+	NFTA_COMPAT_NAME,
+	NFTA_COMPAT_REV,
+	NFTA_COMPAT_TYPE,
+	__NFTA_COMPAT_MAX,
+};
+#define NFTA_COMPAT_MAX (__NFTA_COMPAT_MAX - 1)
+
+#endif
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index d276c3bd55b8..288959404d54 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -54,6 +54,7 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
 #define NFNL_SUBSYS_CTHELPER		9
 #define NFNL_SUBSYS_NFTABLES		10
-#define NFNL_SUBSYS_COUNT		11
+#define NFNL_SUBSYS_NFT_COMPAT		11
+#define NFNL_SUBSYS_COUNT		12
 
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 23525c4c0192..c61cffb9b760 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -15,6 +15,8 @@
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 
 static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -22,6 +24,8 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct iphdr) ||
 		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
 		if (net_ratelimit())
@@ -29,8 +33,9 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv4 __read_mostly = {
@@ -42,6 +47,21 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+
+static unsigned int
+nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "filter",
@@ -52,11 +72,11 @@ static struct nf_chain_type filter_ipv4 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
 	},
 };
 
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index cd286306be85..e09c201adf84 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -23,6 +23,7 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
@@ -181,6 +182,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_conn_nat *nat;
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	struct nft_pktinfo pkt;
 	unsigned int ret;
 
 	if (ct == NULL || nf_ct_is_untracked(ct))
@@ -213,7 +215,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 		if (nf_nat_initialized(ct, maniptype))
 			break;
 
-		ret = nft_do_chain(ops, skb, in, out, okfn);
+		nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
 		if (ret != NF_ACCEPT)
 			return ret;
 		if (!nf_nat_initialized(ct, maniptype)) {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 6b84e097b8fc..4e6bf9a3d7aa 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -17,6 +17,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/route.h>
 #include <net/ip.h>
 
@@ -27,6 +28,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	u32 mark;
 	__be32 saddr, daddr;
 	u_int8_t tos;
@@ -37,13 +39,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
 	mark = skb->mark;
 	iph = ip_hdr(skb);
 	saddr = iph->saddr;
 	daddr = iph->daddr;
 	tos = iph->tos;
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE) {
 		iph = ip_hdr(skb);
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 3631d6238e6f..42f905a808a3 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -14,6 +14,7 @@
 #include <linux/ipv6.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 
 static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -21,14 +22,18 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
 		if (net_ratelimit())
 			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv6 __read_mostly = {
@@ -40,6 +45,22 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static unsigned int
+nft_do_chain_ipv6(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	/* malformed packet, drop it */
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "filter",
@@ -50,11 +71,11 @@ static struct nf_chain_type filter_ipv6 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
 	},
 };
 
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 4cdc992fa067..3fe40f0456ad 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 #include <net/route.h>
 
 static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
@@ -28,10 +29,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	struct in6_addr saddr, daddr;
 	u_int8_t hop_limit;
 	u32 mark, flowlabel;
 
+	/* malformed packet, drop it */
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
+
 	/* save source/dest address, mark, hoplimit, flowlabel, priority */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
 	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
@@ -41,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u32 *)ipv6_hdr(skb));
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE &&
 	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
 	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa184a46bbf3..49e362707379 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,15 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_COMPAT
+	depends on NF_TABLES
+	depends on NETFILTER_XTABLES
+	tristate "Netfilter x_tables over nf_tables module"
+	help
+	  This is required if you intend to use any of existing
+	  x_tables match/target extensions over the nf_tables
+	  framework.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b6b78754e4cc..a6781450b6fb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -70,6 +70,7 @@ nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
 nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9c2d8d5af843..61e017b349cb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -438,7 +438,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 },
 	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
+	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -446,6 +448,33 @@ static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
 	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 },
 };
 
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+	struct nft_stats *cpu_stats, total;
+	struct nlattr *nest;
+	int cpu;
+
+	memset(&total, 0, sizeof(total));
+	for_each_possible_cpu(cpu) {
+		cpu_stats = per_cpu_ptr(stats, cpu);
+		total.pkts += cpu_stats->pkts;
+		total.bytes += cpu_stats->bytes;
+	}
+	nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
+	    nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -ENOSPC;
+}
+
 static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 				     int event, u32 flags, int family,
 				     const struct nft_table *table,
@@ -472,8 +501,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		goto nla_put_failure;
 
 	if (chain->flags & NFT_BASE_CHAIN) {
-		const struct nf_hook_ops *ops = &nft_base_chain(chain)->ops;
-		struct nlattr *nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		const struct nft_base_chain *basechain = nft_base_chain(chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
 		if (nest == NULL)
 			goto nla_put_failure;
 		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
@@ -482,11 +514,21 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
 
+		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
+				 htonl(basechain->policy)))
+			goto nla_put_failure;
+
 		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
 			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
 				goto nla_put_failure;
+
+		if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+			goto nla_put_failure;
 	}
 
+	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+		goto nla_put_failure;
+
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -617,6 +659,67 @@ err:
 	return err;
 }
 
+static int
+nf_tables_chain_policy(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	switch (ntohl(nla_get_be32(attr))) {
+	case NF_DROP:
+		chain->policy = NF_DROP;
+		break;
+	case NF_ACCEPT:
+		chain->policy = NF_ACCEPT;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+static int
+nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	struct nlattr *tb[NFTA_COUNTER_MAX+1];
+	struct nft_stats __percpu *newstats;
+	struct nft_stats *stats;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+		return -EINVAL;
+
+	newstats = alloc_percpu(struct nft_stats);
+	if (newstats == NULL)
+		return -ENOMEM;
+
+	/* Restore old counters on this cpu, no problem. Per-cpu statistics
+	 * are not exposed to userspace.
+	 */
+	stats = this_cpu_ptr(newstats);
+	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+
+	if (chain->stats) {
+		/* nfnl_lock is held, add some nfnl function for this, later */
+		struct nft_stats __percpu *oldstats =
+			rcu_dereference_protected(chain->stats, 1);
+
+		rcu_assign_pointer(chain->stats, newstats);
+		synchronize_rcu();
+		free_percpu(oldstats);
+	} else
+		rcu_assign_pointer(chain->stats, newstats);
+
+	return 0;
+}
+
 static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[])
@@ -626,7 +729,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
-	struct nft_base_chain *basechain;
+	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
 	int family = nfmsg->nfgen_family;
 	u64 handle = 0;
@@ -673,6 +776,26 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
 			return -EEXIST;
 
+		if (nla[NFTA_CHAIN_POLICY]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_chain_policy(nft_base_chain(chain),
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0)
+				return err;
+		}
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_counters(nft_base_chain(chain),
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0)
+				return err;
+		}
+
 		if (nla[NFTA_CHAIN_HANDLE] && name)
 			nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
@@ -727,6 +850,36 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			ops->hook = afi->hooks[ops->hooknum];
 
 		chain->flags |= NFT_BASE_CHAIN;
+
+		if (nla[NFTA_CHAIN_POLICY]) {
+			err = nf_tables_chain_policy(basechain,
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else
+			basechain->policy = NF_ACCEPT;
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			err = nf_tables_counters(basechain,
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else {
+			struct nft_stats __percpu *newstats;
+
+			newstats = alloc_percpu(struct nft_stats);
+			if (newstats == NULL)
+				return -ENOMEM;
+
+			rcu_assign_pointer(nft_base_chain(chain)->stats,
+					   newstats);
+		}
 	} else {
 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 		if (chain == NULL)
@@ -739,6 +892,15 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	list_add_tail(&chain->list, &table->chains);
 	table->use++;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0) {
+			free_percpu(basechain->stats);
+			kfree(basechain);
+			return err;
+		}
+	}
 notify:
 	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
 			       family);
@@ -751,9 +913,10 @@ static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
 
 	BUG_ON(chain->use > 0);
 
-	if (chain->flags & NFT_BASE_CHAIN)
+	if (chain->flags & NFT_BASE_CHAIN) {
+		free_percpu(nft_base_chain(chain)->stats);
 		kfree(nft_base_chain(chain));
-	else
+	} else
 		kfree(chain);
 }
 
@@ -801,13 +964,15 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 const struct nlmsghdr *nlh,
 			 const struct nft_af_info *afi,
 			 const struct nft_table *table,
-			 const struct nft_chain *chain)
+			 const struct nft_chain *chain,
+			 const struct nlattr * const *nla)
 {
 	ctx->skb   = skb;
 	ctx->nlh   = nlh;
 	ctx->afi   = afi;
 	ctx->table = table;
 	ctx->chain = chain;
+	ctx->nla   = nla;
 }
 
 /*
@@ -910,7 +1075,8 @@ struct nft_expr_info {
 	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1];
 };
 
-static int nf_tables_expr_parse(const struct nlattr *nla,
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+				const struct nlattr *nla,
 				struct nft_expr_info *info)
 {
 	const struct nft_expr_type *type;
@@ -935,7 +1101,8 @@ static int nf_tables_expr_parse(const struct nlattr *nla,
 		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
 
 	if (type->select_ops != NULL) {
-		ops = type->select_ops((const struct nlattr * const *)info->tb);
+		ops = type->select_ops(ctx,
+				       (const struct nlattr * const *)info->tb);
 		if (IS_ERR(ops)) {
 			err = PTR_ERR(ops);
 			goto err1;
@@ -1012,6 +1179,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
 	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1269,6 +1437,8 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		handle = nf_tables_alloc_handle(table);
 	}
 
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
 	n = 0;
 	size = 0;
 	if (nla[NFTA_RULE_EXPRESSIONS]) {
@@ -1278,7 +1448,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 				goto err1;
 			if (n == NFT_RULE_MAXEXPRS)
 				goto err1;
-			err = nf_tables_expr_parse(tmp, &info[n]);
+			err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
 			if (err < 0)
 				goto err1;
 			size += info[n].ops->size;
@@ -1294,7 +1464,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	rule->handle = handle;
 	rule->dlen   = size;
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, chain);
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
@@ -1304,13 +1473,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		expr = nft_expr_next(expr);
 	}
 
-	/* Register hook when first rule is inserted into a base chain */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN) {
-		err = nf_register_hook(&nft_base_chain(chain)->ops);
-		if (err < 0)
-			goto err2;
-	}
-
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
 		list_replace_rcu(&old_rule->list, &rule->list);
 		nf_tables_rule_destroy(old_rule);
@@ -1379,10 +1541,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 		}
 	}
 
-	/* Unregister hook when last rule from base chain is deleted */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN)
-		nf_unregister_hook(&nft_base_chain(chain)->ops);
-
 	return 0;
 }
 
@@ -1470,7 +1628,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 			return PTR_ERR(table);
 	}
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -1799,7 +1957,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
 
 	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
 	if (IS_ERR(set)) {
@@ -1987,7 +2145,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -2435,23 +2593,27 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 {
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
-	const struct nft_data *data;
 	const struct nft_set *set;
 	struct nft_set_binding *binding;
 	struct nft_set_iter iter;
-	int err;
 
 	if (ctx->chain == chain)
 		return -ELOOP;
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		nft_rule_for_each_expr(expr, last, rule) {
-			if (!expr->ops->get_verdict)
+			const struct nft_data *data = NULL;
+			int err;
+
+			if (!expr->ops->validate)
 				continue;
 
-			data = expr->ops->get_verdict(expr);
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+
 			if (data == NULL)
-				break;
+				continue;
 
 			switch (data->verdict) {
 			case NFT_JUMP:
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 9aede59ed2d7..e51a45c12128 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -60,27 +60,34 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 	return true;
 }
 
-unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-			  struct sk_buff *skb,
-			  const struct net_device *in,
-			  const struct net_device *out,
-			  int (*okfn)(struct sk_buff *))
+struct nft_jumpstack {
+	const struct nft_chain	*chain;
+	const struct nft_rule	*rule;
+};
+
+static inline void
+nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt,
+		struct nft_jumpstack *jumpstack, unsigned int stackptr)
+{
+	struct nft_stats __percpu *stats;
+	const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this;
+
+	rcu_read_lock_bh();
+	stats = rcu_dereference(nft_base_chain(chain)->stats);
+	__this_cpu_inc(stats->pkts);
+	__this_cpu_add(stats->bytes, pkt->skb->len);
+	rcu_read_unlock_bh();
+}
+
+unsigned int
+nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
 {
 	const struct nft_chain *chain = ops->priv;
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
 	struct nft_data data[NFT_REG_MAX + 1];
-	const struct nft_pktinfo pkt = {
-		.skb		= skb,
-		.in		= in,
-		.out		= out,
-		.hooknum	= ops->hooknum,
-	};
 	unsigned int stackptr = 0;
-	struct {
-		const struct nft_chain	*chain;
-		const struct nft_rule	*rule;
-	} jumpstack[NFT_JUMP_STACK_SIZE];
+	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
 
 do_chain:
 	rule = list_entry(&chain->rules, struct nft_rule, list);
@@ -91,8 +98,8 @@ next_rule:
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, data);
 			else if (expr->ops != &nft_payload_fast_ops ||
-				 !nft_payload_fast_eval(expr, data, &pkt))
-				expr->ops->eval(expr, data, &pkt);
+				 !nft_payload_fast_eval(expr, data, pkt))
+				expr->ops->eval(expr, data, pkt);
 
 			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
 				break;
@@ -135,10 +142,11 @@ next_rule:
 		rule  = jumpstack[stackptr].rule;
 		goto next_rule;
 	}
+	nft_chain_stats(chain, pkt, jumpstack, stackptr);
 
-	return NF_ACCEPT;
+	return nft_base_chain(chain)->policy;
 }
-EXPORT_SYMBOL_GPL(nft_do_chain);
+EXPORT_SYMBOL_GPL(nft_do_chain_pktinfo);
 
 int __init nf_tables_core_module_init(void)
 {
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 37134f3e84fb..954925db414d 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -162,7 +162,8 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
 	.dump		= nft_cmp_fast_dump,
 };
 
-static const struct nft_expr_ops *nft_cmp_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
 	struct nft_data_desc desc;
 	struct nft_data data;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 000000000000..4811f762e060
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,768 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <asm/uaccess.h> /* for set_fs */
+#include <net/netfilter/nf_tables.h>
+
+union nft_entry {
+	struct ipt_entry e4;
+	struct ip6t_entry e6;
+};
+
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+	par->target	= xt;
+	par->targinfo	= xt_info;
+	par->hotdrop	= false;
+}
+
+static void nft_target_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct sk_buff *skb = pkt->skb;
+	int ret;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+	ret = target->target(skb, &pkt->xt);
+
+	if (pkt->xt.hotdrop)
+		ret = NF_DROP;
+
+	switch(ret) {
+	case XT_CONTINUE:
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+		break;
+	default:
+		data[NFT_REG_VERDICT].verdict = ret;
+		break;
+	}
+	return;
+}
+
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_TARGET_REV]	= { .type = NLA_U32 },
+	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY },
+};
+
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+			   const struct nft_ctx *ctx,
+			   struct xt_target *target, void *info,
+			   union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->target	= target;
+	par->targinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	}
+	par->family	= ctx->afi->family;
+}
+
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (t->compat_from_user) {
+		int pad;
+
+		t->compat_from_user(out, in);
+		pad = XT_ALIGN(t->targetsize) - t->targetsize;
+		if (pad > 0)
+			memset(out + t->targetsize, 0, pad);
+	} else
+#endif
+		memcpy(out, in, XT_ALIGN(t->targetsize));
+}
+
+static inline int nft_compat_target_offset(struct xt_target *target)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_target_offset(target);
+#else
+	return 0;
+#endif
+}
+
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+	[NFTA_RULE_COMPAT_PROTO]	= { .type = NLA_U32 },
+	[NFTA_RULE_COMPAT_FLAGS]	= { .type = NLA_U32 },
+};
+
+static u8 nft_parse_compat(const struct nlattr *attr, bool *inv)
+{
+	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+	u32 flags;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+			       nft_rule_compat_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS])
+		return -EINVAL;
+
+	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+	if (flags & ~NFT_RULE_COMPAT_F_MASK)
+		return -EINVAL;
+	if (flags & NFT_RULE_COMPAT_F_INV)
+		*inv = true;
+
+	return ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+}
+
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct xt_tgchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+	ret = xt_check_target(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	/* The standard target cannot be used */
+	if (target->target == NULL) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+err:
+	module_put(target->me);
+	return ret;
+}
+
+static void
+nft_target_destroy(const struct nft_expr *expr)
+{
+	struct xt_target *target = expr->ops->data;
+
+	module_put(target->me);
+}
+
+static int
+target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (t->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		t->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out);
+		kfree(out);
+	} else
+#endif
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in);
+
+	return ret;
+}
+
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct xt_target *target = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
+	    nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
+	    target_dump_info(skb, target, info))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_target_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **data)
+{
+	struct xt_target *target = expr->ops->data;
+	unsigned int hook_mask = 0;
+
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		hook_mask = 1 << ops->hooknum;
+		if (hook_mask & target->hooks)
+			return 0;
+
+		/* This target is being called from an invalid chain */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct sk_buff *skb = pkt->skb;
+	bool ret;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+
+	ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+
+	if (pkt->xt.hotdrop) {
+		data[NFT_REG_VERDICT].verdict = NF_DROP;
+		return;
+	}
+
+	switch(ret) {
+	case true:
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+		break;
+	case false:
+		data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+		break;
+	}
+}
+
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_MATCH_REV]	= { .type = NLA_U32 },
+	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY },
+};
+
+/* struct xt_mtchk_param and xt_tgchk_param look very similar */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+			  struct xt_match *match, void *info,
+			  union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->match	= match;
+	par->matchinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	}
+	par->family	= ctx->afi->family;
+}
+
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (m->compat_from_user) {
+		int pad;
+
+		m->compat_from_user(out, in);
+		pad = XT_ALIGN(m->matchsize) - m->matchsize;
+		if (pad > 0)
+			memset(out + m->matchsize, 0, pad);
+	} else
+#endif
+		memcpy(out, in, XT_ALIGN(m->matchsize));
+}
+
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct xt_mtchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+	ret = xt_check_match(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	module_put(match->me);
+	return ret;
+}
+
+static void
+nft_match_destroy(const struct nft_expr *expr)
+{
+	struct xt_match *match = expr->ops->data;
+
+	module_put(match->me);
+}
+
+static int
+match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (m->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		m->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out);
+		kfree(out);
+	} else
+#endif
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in);
+
+	return ret;
+}
+
+static inline int nft_compat_match_offset(struct xt_match *match)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_match_offset(match);
+#else
+	return 0;
+#endif
+}
+
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+
+	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
+	    nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
+	    match_dump_info(skb, match, info))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_match_validate(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nft_data **data)
+{
+	struct xt_match *match = expr->ops->data;
+	unsigned int hook_mask = 0;
+
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		hook_mask = 1 << ops->hooknum;
+		if (hook_mask & match->hooks)
+			return 0;
+
+		/* This match is being called from an invalid chain */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+		      int event, u16 family, const char *name,
+		      int rev, int target)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+	event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = family;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
+	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
+	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret = 0, target;
+	struct nfgenmsg *nfmsg;
+	const char *fmt;
+	const char *name;
+	u32 rev;
+	struct sk_buff *skb2;
+
+	if (tb[NFTA_COMPAT_NAME] == NULL ||
+	    tb[NFTA_COMPAT_REV] == NULL ||
+	    tb[NFTA_COMPAT_TYPE] == NULL)
+		return -EINVAL;
+
+	name = nla_data(tb[NFTA_COMPAT_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+	nfmsg = nlmsg_data(nlh);
+
+	switch(nfmsg->nfgen_family) {
+	case AF_INET:
+		fmt = "ipt_%s";
+		break;
+	case AF_INET6:
+		fmt = "ip6t_%s";
+		break;
+	default:
+		pr_err("nft_compat: unsupported protocol %d\n",
+			nfmsg->nfgen_family);
+		return -EINVAL;
+	}
+
+	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+						 rev, target, &ret),
+						 fmt, name);
+
+	if (ret < 0)
+		return ret;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	/* include the best revision for this extension in the message */
+	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
+				  nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(nlh->nlmsg_type),
+				  NFNL_MSG_COMPAT_GET,
+				  nfmsg->nfgen_family,
+				  name, ret, target) <= 0) {
+		kfree_skb(skb2);
+		return -ENOSPC;
+	}
+
+	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+				MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	return ret == -EAGAIN ? -ENOBUFS : ret;
+}
+
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
+	[NFTA_COMPAT_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NFT_COMPAT_NAME_MAX-1 },
+	[NFTA_COMPAT_REV]	= { .type = NLA_U32 },
+	[NFTA_COMPAT_TYPE]	= { .type = NLA_U32 },
+};
+
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
+	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get,
+					    .attr_count = NFTA_COMPAT_MAX,
+					    .policy = nfnl_compat_policy_get },
+};
+
+static const struct nfnetlink_subsystem nfnl_compat_subsys = {
+	.name		= "nft-compat",
+	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT,
+	.cb_count	= NFNL_MSG_COMPAT_MAX,
+	.cb		= nfnl_nft_compat_cb,
+};
+
+static LIST_HEAD(nft_match_list);
+
+struct nft_xt {
+	struct list_head	head;
+	struct nft_expr_ops	ops;
+};
+
+static struct nft_expr_type nft_match_type;
+
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_match;
+	struct xt_match *match;
+	char *mt_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_MATCH_NAME] == NULL ||
+	    tb[NFTA_MATCH_REV] == NULL ||
+	    tb[NFTA_MATCH_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing match if it's already loaded. */
+	list_for_each_entry(nft_match, &nft_match_list, head) {
+		struct xt_match *match = nft_match->ops.data;
+
+		if (strcmp(match->name, mt_name) == 0 &&
+		    match->revision == rev && match->family == family)
+			return &nft_match->ops;
+	}
+
+	match = xt_request_find_match(family, mt_name, rev);
+	if (IS_ERR(match))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this match, allocate operations */
+	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_match == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_match->ops.type = &nft_match_type;
+	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) +
+					    nft_compat_match_offset(match));
+	nft_match->ops.eval = nft_match_eval;
+	nft_match->ops.init = nft_match_init;
+	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.dump = nft_match_dump;
+	nft_match->ops.validate = nft_match_validate;
+	nft_match->ops.data = match;
+
+	list_add(&nft_match->head, &nft_match_list);
+
+	return &nft_match->ops;
+}
+
+static void nft_match_release(void)
+{
+	struct nft_xt *nft_match;
+
+	list_for_each_entry(nft_match, &nft_match_list, head)
+		kfree(nft_match);
+}
+
+static struct nft_expr_type nft_match_type __read_mostly = {
+	.name		= "match",
+	.select_ops	= nft_match_select_ops,
+	.policy		= nft_match_policy,
+	.maxattr	= NFTA_MATCH_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static LIST_HEAD(nft_target_list);
+
+static struct nft_expr_type nft_target_type;
+
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_target;
+	struct xt_target *target;
+	char *tg_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_TARGET_NAME] == NULL ||
+	    tb[NFTA_TARGET_REV] == NULL ||
+	    tb[NFTA_TARGET_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing target if it's already loaded. */
+	list_for_each_entry(nft_target, &nft_match_list, head) {
+		struct xt_target *target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_target == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+	struct nft_xt *nft_target;
+
+	list_for_each_entry(nft_target, &nft_target_list, head)
+		kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+	.name		= "target",
+	.select_ops	= nft_target_select_ops,
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+	int ret;
+
+	ret = nft_register_expr(&nft_match_type);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_register_expr(&nft_target_type);
+	if (ret < 0)
+		goto err_match;
+
+	ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (ret < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		goto err_target;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return ret;
+
+err_target:
+	nft_unregister_expr(&nft_target_type);
+err_match:
+	nft_unregister_expr(&nft_match_type);
+	return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+	nft_match_release();
+	nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 1bfeeaf865b6..f169501f1ad4 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -90,14 +90,16 @@ nla_put_failure:
 	return -1;
 }
 
-static const struct nft_data *nft_immediate_get_verdict(const struct nft_expr *expr)
+static int nft_immediate_validate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr,
+				  const struct nft_data **data)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
 	if (priv->dreg == NFT_REG_VERDICT)
-		return &priv->data;
-	else
-		return NULL;
+		*data = &priv->data;
+
+	return 0;
 }
 
 static struct nft_expr_type nft_imm_type;
@@ -108,7 +110,7 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.init		= nft_immediate_init,
 	.destroy	= nft_immediate_destroy,
 	.dump		= nft_immediate_dump,
-	.get_verdict	= nft_immediate_get_verdict,
+	.validate	= nft_immediate_validate,
 };
 
 static struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7cf13f7e1e94..bc8bdb2c1ba7 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -107,7 +107,9 @@ const struct nft_expr_ops nft_payload_fast_ops = {
 	.dump		= nft_payload_dump,
 };
 
-static const struct nft_expr_ops *nft_payload_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_payload_select_ops(const struct nft_ctx *ctx,
+		       const struct nlattr * const tb[])
 {
 	enum nft_payload_bases base;
 	unsigned int offset, len;
-- 
cgit v1.2.3


From eb31628e37a0a4e01fffd79dcc7f815d2357f53a Mon Sep 17 00:00:00 2001
From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Date: Thu, 10 Oct 2013 13:39:19 +0200
Subject: netfilter: nf_tables: Add support for IPv6 NAT

This patch generalizes the NAT expression to support both IPv4 and IPv6
using the existing IPv4/IPv6 NAT infrastructure. This also adds the
NAT chain type for IPv6.

This patch collapses the following patches that were posted to the
netfilter-devel mailing list, from Tomasz:

* nf_tables: Change NFTA_NAT_ attributes to better semantic significance
* nf_tables: Split IPv4 NAT into NAT expression and IPv4 NAT chain
* nf_tables: Add support for IPv6 NAT expression
* nf_tables: Add support for IPv6 NAT chain
* nf_tables: Fix up build issue on IPv6 NAT support

And, from Pablo Neira Ayuso:

* fix missing dependencies in nft_chain_nat

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  18 +--
 net/ipv4/netfilter/Kconfig               |   1 +
 net/ipv4/netfilter/nft_chain_nat_ipv4.c  | 156 +---------------------
 net/ipv6/netfilter/Kconfig               |   5 +
 net/ipv6/netfilter/Makefile              |   1 +
 net/ipv6/netfilter/nft_chain_nat_ipv6.c  | 211 +++++++++++++++++++++++++++++
 net/netfilter/Kconfig                    |   6 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_nat.c                  | 220 +++++++++++++++++++++++++++++++
 9 files changed, 457 insertions(+), 162 deletions(-)
 create mode 100644 net/ipv6/netfilter/nft_chain_nat_ipv6.c
 create mode 100644 net/netfilter/nft_nat.c

(limited to 'net/ipv4')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a9c4bce1988f..7d4a1992f89c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -695,18 +695,20 @@ enum nft_nat_types {
  * enum nft_nat_attributes - nf_tables nat expression netlink attributes
  *
  * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types)
- * @NFTA_NAT_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
- * @NFTA_NAT_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_FAMILY: NAT family (NLA_U32)
+ * @NFTA_NAT_REG_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_nat_attributes {
 	NFTA_NAT_UNSPEC,
 	NFTA_NAT_TYPE,
-	NFTA_NAT_ADDR_MIN,
-	NFTA_NAT_ADDR_MAX,
-	NFTA_NAT_PROTO_MIN,
-	NFTA_NAT_PROTO_MAX,
+	NFTA_NAT_FAMILY,
+	NFTA_NAT_REG_ADDR_MIN,
+	NFTA_NAT_REG_ADDR_MAX,
+	NFTA_NAT_REG_PROTO_MIN,
+	NFTA_NAT_REG_PROTO_MAX,
 	__NFTA_NAT_MAX
 };
 #define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ae65fe98bfbe..1f37ef67f1ac 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -50,6 +50,7 @@ config NFT_CHAIN_ROUTE_IPV4
 
 config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
+	depends on NF_NAT_IPV4 && NFT_NAT
 	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index e09c201adf84..cf2c792cd971 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,10 +15,8 @@
 #include <linux/list.h>
 #include <linux/skbuff.h>
 #include <linux/ip.h>
-#include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_nat.h>
@@ -27,147 +26,6 @@
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
-struct nft_nat {
-	enum nft_registers	sreg_addr_min:8;
-	enum nft_registers	sreg_addr_max:8;
-	enum nft_registers	sreg_proto_min:8;
-	enum nft_registers	sreg_proto_max:8;
-	enum nf_nat_manip_type	type;
-};
-
-static void nft_nat_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_addr_min) {
-		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
-		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
-		range.flags |= NF_NAT_RANGE_MAP_IPS;
-	}
-
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = data[priv->sreg_proto_min].data[0];
-		range.max_proto.all = data[priv->sreg_proto_max].data[0];
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	data[NFT_REG_VERDICT].verdict =
-		nf_nat_setup_info(ct, &range, priv->type);
-}
-
-static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
-	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
-};
-
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_nat *priv = nft_expr_priv(expr);
-	int err;
-
-	if (tb[NFTA_NAT_TYPE] == NULL)
-		return -EINVAL;
-
-	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
-	case NFT_NAT_SNAT:
-		priv->type = NF_NAT_MANIP_SRC;
-		break;
-	case NFT_NAT_DNAT:
-		priv->type = NF_NAT_MANIP_DST;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MIN]) {
-		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
-		err = nft_validate_input_register(priv->sreg_addr_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MAX]) {
-		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
-		err = nft_validate_input_register(priv->sreg_addr_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_addr_max = priv->sreg_addr_min;
-
-	if (tb[NFTA_NAT_PROTO_MIN]) {
-		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
-		err = nft_validate_input_register(priv->sreg_proto_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_PROTO_MAX]) {
-		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
-		err = nft_validate_input_register(priv->sreg_proto_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_proto_max = priv->sreg_proto_min;
-
-	return 0;
-}
-
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-
-	switch (priv->type) {
-	case NF_NAT_MANIP_SRC:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
-			goto nla_put_failure;
-		break;
-	case NF_NAT_MANIP_DST:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
-			goto nla_put_failure;
-		break;
-	}
-
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_type nft_nat_type;
-static const struct nft_expr_ops nft_nat_ops = {
-	.type		= &nft_nat_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.eval		= nft_nat_eval,
-	.init		= nft_nat_init,
-	.dump		= nft_nat_dump,
-};
-
-static struct nft_expr_type nft_nat_type __read_mostly = {
-	.name		= "nat",
-	.ops		= &nft_nat_ops,
-	.policy		= nft_nat_policy,
-	.maxattr	= NFTA_NAT_MAX,
-	.owner		= THIS_MODULE,
-};
-
 /*
  * NAT chains
  */
@@ -306,7 +164,7 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-struct nf_chain_type nft_chain_nat_ipv4 = {
+static struct nf_chain_type nft_chain_nat_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
@@ -331,20 +189,11 @@ static int __init nft_chain_nat_init(void)
 	if (err < 0)
 		return err;
 
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		goto err;
-
 	return 0;
-
-err:
-	nft_unregister_chain_type(&nft_chain_nat_ipv4);
-	return err;
 }
 
 static void __exit nft_chain_nat_exit(void)
 {
-	nft_unregister_expr(&nft_nat_type);
 	nft_unregister_chain_type(&nft_chain_nat_ipv4);
 }
 
@@ -354,4 +203,3 @@ module_exit(nft_chain_nat_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
-MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 23833064b7b5..7702f9e90a04 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -33,6 +33,11 @@ config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
 	tristate "IPv6 nf_tables route chain support"
 
+config NFT_CHAIN_NAT_IPV6
+	depends on NF_TABLES_IPV6
+	depends on NF_NAT_IPV6 && NFT_NAT
+	tristate "IPv6 nf_tables nat chain support"
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index be4913aa524d..d1b4928f34f7 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
new file mode 100644
index 000000000000..e86dcd70dc76
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ipv6.h>
+
+/*
+ * IPv6 NAT chains
+ */
+
+static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	__be16 frag_off;
+	int hdrlen;
+	u8 nexthdr;
+	struct nft_pktinfo pkt;
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		nexthdr = ipv6_hdr(skb)->nexthdr;
+		hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+					  &nexthdr, &frag_off);
+
+		if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+			if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum,
+							   hdrlen))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+		skb_dst_drop(skb);
+
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+				      &ct->tuplehash[!dir].tuple.dst.u3) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET6) < 0)
+				ret = NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+				      &ct->tuplehash[!dir].tuple.src.u3)) {
+			if (ip6_route_me_harder(skb))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET6))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_nat_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_ipv6_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_ipv6_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_ipv6_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_ipv6_fn,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_nat_ipv6_init(void)
+{
+	int err;
+
+	err = nft_register_chain_type(&nft_chain_nat_ipv6);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_chain_nat_ipv6_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_nat_ipv6);
+}
+
+module_init(nft_chain_nat_ipv6_init);
+module_exit(nft_chain_nat_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 49e362707379..48acec17e27a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,12 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_NAT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	depends on NF_NAT
+	tristate "Netfilter nf_tables nat module"
+
 config NFT_COMPAT
 	depends on NF_TABLES
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a6781450b6fb..394483b2c193 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
+obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 #nf_tables-objs			+= nft_meta_target.o
 obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o
 obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
new file mode 100644
index 000000000000..b0b87b2d2411
--- /dev/null
+++ b/net/netfilter/nft_nat.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers      sreg_addr_min:8;
+	enum nft_registers      sreg_addr_max:8;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
+	int                     family;
+	enum nf_nat_manip_type  type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		if (priv->family == AF_INET) {
+			range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+			range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+
+		} else {
+			memcpy(range.min_addr.ip6,
+			       data[priv->sreg_addr_min].data,
+			       sizeof(struct nft_data));
+			memcpy(range.max_addr.ip6,
+			       data[priv->sreg_addr_max].data,
+			       sizeof(struct nft_data));
+		}
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_TYPE]		 = { .type = NLA_U32 },
+	[NFTA_NAT_FAMILY]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MIN]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MAX]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_FAMILY] == NULL)
+		return -EINVAL;
+
+	priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+	if (priv->family != AF_INET && priv->family != AF_INET6)
+		return -EINVAL;
+
+	if (tb[NFTA_NAT_REG_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_REG_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type           = &nft_nat_type,
+	.size           = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.eval           = nft_nat_eval,
+	.init           = nft_nat_init,
+	.dump           = nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name           = "nat",
+	.ops            = &nft_nat_ops,
+	.policy         = nft_nat_policy,
+	.maxattr        = NFTA_NAT_MAX,
+	.owner          = THIS_MODULE,
+};
+
+static int __init nft_nat_module_init(void)
+{
+	int err;
+
+	err = nft_register_expr(&nft_nat_type);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_nat_module_exit(void)
+{
+	nft_unregister_expr(&nft_nat_type);
+}
+
+module_init(nft_nat_module_init);
+module_exit(nft_nat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_EXPR("nat");
-- 
cgit v1.2.3


From 99633ab29b2131b68089a6c7f60458390860e044 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 10 Oct 2013 23:28:33 +0200
Subject: netfilter: nf_tables: complete net namespace support

Register family per netnamespace to ensure that sets are
only visible in its approapriate namespace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h             |  4 ++
 include/net/netfilter/nf_tables.h       |  4 +-
 include/net/netns/nftables.h            | 15 ++++++
 net/bridge/netfilter/nf_tables_bridge.c | 32 ++++++++++++-
 net/ipv4/netfilter/nf_tables_ipv4.c     | 32 ++++++++++++-
 net/ipv6/netfilter/nf_tables_ipv6.c     | 33 ++++++++++++-
 net/netfilter/nf_tables_api.c           | 83 ++++++++++++++++++++++-----------
 7 files changed, 168 insertions(+), 35 deletions(-)
 create mode 100644 include/net/netns/nftables.h

(limited to 'net/ipv4')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bcc4a8ed4450..da68c9a90ac5 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -22,6 +22,7 @@
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netns/conntrack.h>
 #endif
+#include <net/netns/nftables.h>
 #include <net/netns/xfrm.h>
 
 struct user_namespace;
@@ -101,6 +102,9 @@ struct net {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
 #endif
+#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
+	struct netns_nftables	nft;
+#endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	struct netns_nf_frag	nf_frag;
 #endif
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index a68f45f0fe2e..d3272e943aac 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -68,6 +68,7 @@ static inline void nft_data_debug(const struct nft_data *data)
 /**
  *	struct nft_ctx - nf_tables rule/set context
  *
+ *	@net: net namespace
  * 	@skb: netlink skb
  * 	@nlh: netlink message header
  * 	@afi: address family info
@@ -76,6 +77,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  *	@nla: netlink attributes
  */
 struct nft_ctx {
+	struct net			*net;
 	const struct sk_buff		*skb;
 	const struct nlmsghdr		*nlh;
 	const struct nft_af_info	*afi;
@@ -462,7 +464,7 @@ struct nft_af_info {
 	nf_hookfn			*hooks[NF_MAX_HOOKS];
 };
 
-extern int nft_register_afinfo(struct nft_af_info *);
+extern int nft_register_afinfo(struct net *, struct nft_af_info *);
 extern void nft_unregister_afinfo(struct nft_af_info *);
 
 struct nf_chain_type {
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
new file mode 100644
index 000000000000..a98b1c5d9913
--- /dev/null
+++ b/include/net/netns/nftables.h
@@ -0,0 +1,15 @@
+#ifndef _NETNS_NFTABLES_H_
+#define _NETNS_NFTABLES_H_
+
+#include <linux/list.h>
+
+struct nft_af_info;
+
+struct netns_nftables {
+	struct list_head	af_info;
+	struct nft_af_info	*ipv4;
+	struct nft_af_info	*ipv6;
+	struct nft_af_info	*bridge;
+};
+
+#endif
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index bc5c21c911c0..e8cb016fa34d 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -19,14 +19,42 @@ static struct nft_af_info nft_af_bridge __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+static int nf_tables_bridge_init_net(struct net *net)
+{
+	net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.bridge == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge));
+
+	if (nft_register_afinfo(net, net->nft.bridge) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.bridge);
+	return -ENOMEM;
+}
+
+static void nf_tables_bridge_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.bridge);
+	kfree(net->nft.bridge);
+}
+
+static struct pernet_operations nf_tables_bridge_net_ops = {
+	.init	= nf_tables_bridge_init_net,
+	.exit	= nf_tables_bridge_exit_net,
+};
+
 static int __init nf_tables_bridge_init(void)
 {
-	return nft_register_afinfo(&nft_af_bridge);
+	return register_pernet_subsys(&nf_tables_bridge_net_ops);
 }
 
 static void __exit nf_tables_bridge_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_bridge);
+	return unregister_pernet_subsys(&nf_tables_bridge_net_ops);
 }
 
 module_init(nf_tables_bridge_init);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index c61cffb9b760..8f7536be1322 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -14,6 +14,7 @@
 #include <linux/ip.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_tables_ipv4.h>
@@ -47,6 +48,33 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.ipv4 == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+	if (nft_register_afinfo(net, net->nft.ipv4) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.ipv4);
+	return -ENOMEM;
+}
+
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.ipv4);
+	kfree(net->nft.ipv4);
+}
+
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+	.init	= nf_tables_ipv4_init_net,
+	.exit	= nf_tables_ipv4_exit_net,
+};
 
 static unsigned int
 nft_do_chain_ipv4(const struct nf_hook_ops *ops,
@@ -83,12 +111,12 @@ static struct nf_chain_type filter_ipv4 = {
 static int __init nf_tables_ipv4_init(void)
 {
 	nft_register_chain_type(&filter_ipv4);
-	return nft_register_afinfo(&nft_af_ipv4);
+	return register_pernet_subsys(&nf_tables_ipv4_net_ops);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_ipv4);
+	unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
 	nft_unregister_chain_type(&filter_ipv4);
 }
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 42f905a808a3..d77db8a13505 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -45,6 +45,34 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static int nf_tables_ipv6_init_net(struct net *net)
+{
+	net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.ipv6 == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6));
+
+	if (nft_register_afinfo(net, net->nft.ipv6) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.ipv6);
+	return -ENOMEM;
+}
+
+static void nf_tables_ipv6_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.ipv6);
+	kfree(net->nft.ipv6);
+}
+
+static struct pernet_operations nf_tables_ipv6_net_ops = {
+	.init	= nf_tables_ipv6_init_net,
+	.exit	= nf_tables_ipv6_exit_net,
+};
+
 static unsigned int
 nft_do_chain_ipv6(const struct nf_hook_ops *ops,
 		  struct sk_buff *skb,
@@ -82,11 +110,12 @@ static struct nf_chain_type filter_ipv6 = {
 static int __init nf_tables_ipv6_init(void)
 {
 	nft_register_chain_type(&filter_ipv6);
-	return nft_register_afinfo(&nft_af_ipv6);
+	return register_pernet_subsys(&nf_tables_ipv6_net_ops);
 }
+
 static void __exit nf_tables_ipv6_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_ipv6);
+	unregister_pernet_subsys(&nf_tables_ipv6_net_ops);
 	nft_unregister_chain_type(&filter_ipv6);
 }
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a4dd7ce5ec3e..e1ee85047ec1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -18,9 +18,9 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
 #include <net/sock.h>
 
-static LIST_HEAD(nf_tables_afinfo);
 static LIST_HEAD(nf_tables_expressions);
 
 /**
@@ -31,11 +31,11 @@ static LIST_HEAD(nf_tables_expressions);
  *	Register the address family for use with nf_tables. Returns zero on
  *	success or a negative errno code otherwise.
  */
-int nft_register_afinfo(struct nft_af_info *afi)
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
 {
 	INIT_LIST_HEAD(&afi->tables);
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_add_tail(&afi->list, &nf_tables_afinfo);
+	list_add_tail(&afi->list, &net->nft.af_info);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return 0;
 }
@@ -56,22 +56,23 @@ void nft_unregister_afinfo(struct nft_af_info *afi)
 }
 EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
 
-static struct nft_af_info *nft_afinfo_lookup(int family)
+static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
 {
 	struct nft_af_info *afi;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (afi->family == family)
 			return afi;
 	}
 	return NULL;
 }
 
-static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
+static struct nft_af_info *
+nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
 {
 	struct nft_af_info *afi;
 
-	afi = nft_afinfo_lookup(family);
+	afi = nft_afinfo_lookup(net, family);
 	if (afi != NULL)
 		return afi;
 #ifdef CONFIG_MODULES
@@ -79,7 +80,7 @@ static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 		request_module("nft-afinfo-%u", family);
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		afi = nft_afinfo_lookup(family);
+		afi = nft_afinfo_lookup(net, family);
 		if (afi != NULL)
 			return ERR_PTR(-EAGAIN);
 	}
@@ -232,9 +233,10 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -268,6 +270,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -278,7 +281,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -379,9 +382,10 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nlattr *name;
 	struct nft_af_info *afi;
 	struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, true);
+	afi = nf_tables_afinfo_lookup(net, family, true);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -433,9 +437,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nft_af_info *afi;
 	struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -663,9 +668,10 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -702,6 +708,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -712,7 +719,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -813,6 +820,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_chain *chain;
 	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	u64 handle = 0;
 	int err;
@@ -820,7 +828,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(family, true);
+	afi = nf_tables_afinfo_lookup(net, family, true);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1010,9 +1018,10 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1050,6 +1059,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 const struct nft_chain *chain,
 			 const struct nlattr * const *nla)
 {
+	ctx->net   = sock_net(skb->sk);
 	ctx->skb   = skb;
 	ctx->nlh   = nlh;
 	ctx->afi   = afi;
@@ -1361,9 +1371,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -1402,6 +1413,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -1412,7 +1424,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1477,6 +1489,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
@@ -1490,7 +1503,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1585,12 +1598,13 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	const struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *tmp;
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1697,11 +1711,12 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 				     const struct nlmsghdr *nlh,
 				     const struct nlattr * const nla[])
 {
+	struct net *net = sock_net(skb->sk);
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
 	const struct nft_table *table = NULL;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1818,12 +1833,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
 {
 	struct sk_buff *skb;
 	u32 portid = NETLINK_CB(ctx->skb).portid;
-	struct net *net = sock_net(ctx->skb->sk);
 	bool report;
 	int err;
 
 	report = nlmsg_report(ctx->nlh);
-	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+	if (!report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
 		return 0;
 
 	err = -ENOBUFS;
@@ -1837,11 +1851,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+	err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, report,
 			     GFP_KERNEL);
 err:
 	if (err < 0)
-		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+		nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
 	return err;
 }
 
@@ -1974,6 +1988,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_set_ops *ops;
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -2032,7 +2047,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -2219,8 +2234,9 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -3011,6 +3027,16 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
 }
 EXPORT_SYMBOL_GPL(nft_data_dump);
 
+static int nf_tables_init_net(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nft.af_info);
+	return 0;
+}
+
+static struct pernet_operations nf_tables_net_ops = {
+	.init	= nf_tables_init_net,
+};
+
 static int __init nf_tables_module_init(void)
 {
 	int err;
@@ -3031,7 +3057,7 @@ static int __init nf_tables_module_init(void)
 		goto err3;
 
 	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
-	return 0;
+	return register_pernet_subsys(&nf_tables_net_ops);
 err3:
 	nf_tables_core_module_exit();
 err2:
@@ -3042,6 +3068,7 @@ err1:
 
 static void __exit nf_tables_module_exit(void)
 {
+	unregister_pernet_subsys(&nf_tables_net_ops);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
 	nf_tables_core_module_exit();
 	kfree(info);
-- 
cgit v1.2.3


From ed683f138b3dbc8a5e878e24a0bfa0bb61043a09 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 7 Oct 2013 22:53:08 +0200
Subject: netfilter: nf_tables: add ARP filtering support

This patch registers the ARP family and he filter chain type
for this family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/nftables.h       |   1 +
 net/ipv4/netfilter/Kconfig         |   4 ++
 net/ipv4/netfilter/Makefile        |   1 +
 net/ipv4/netfilter/nf_tables_arp.c | 102 +++++++++++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+)
 create mode 100644 net/ipv4/netfilter/nf_tables_arp.c

(limited to 'net/ipv4')

diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 08a4248a12b5..15d056d534e3 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -10,6 +10,7 @@ struct netns_nftables {
 	struct list_head	commit_list;
 	struct nft_af_info	*ipv4;
 	struct nft_af_info	*ipv6;
+	struct nft_af_info	*arp;
 	struct nft_af_info	*bridge;
 	u8			gencursor;
 	u8			genctr;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1f37ef67f1ac..40d56073cd19 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -53,6 +53,10 @@ config NFT_CHAIN_NAT_IPV4
 	depends on NF_NAT_IPV4 && NFT_NAT
 	tristate "IPv4 nf_tables nat chain support"
 
+config NF_TABLES_ARP
+	depends on NF_TABLES
+	tristate "ARP nf_tables support"
+
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 91e0bd71a6d3..19df72b7ba88 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
+obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
new file mode 100644
index 000000000000..3e67ef1c676f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables.h>
+
+static struct nft_af_info nft_af_arp __read_mostly = {
+	.family		= NFPROTO_ARP,
+	.nhooks		= NF_ARP_NUMHOOKS,
+	.owner		= THIS_MODULE,
+};
+
+static int nf_tables_arp_init_net(struct net *net)
+{
+	net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.arp== NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
+
+	if (nft_register_afinfo(net, net->nft.arp) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.arp);
+	return -ENOMEM;
+}
+
+static void nf_tables_arp_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.arp);
+	kfree(net->nft.arp);
+}
+
+static struct pernet_operations nf_tables_arp_net_ops = {
+	.init   = nf_tables_arp_init_net,
+	.exit   = nf_tables_arp_exit_net,
+};
+
+static unsigned int
+nft_do_chain_arp(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, ops, skb, in, out);
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
+static struct nf_chain_type filter_arp = {
+	.family		= NFPROTO_ARP,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_ARP_IN) |
+			  (1 << NF_ARP_OUT) |
+			  (1 << NF_ARP_FORWARD),
+	.fn		= {
+		[NF_ARP_IN]		= nft_do_chain_arp,
+		[NF_ARP_OUT]		= nft_do_chain_arp,
+		[NF_ARP_FORWARD]	= nft_do_chain_arp,
+	},
+};
+
+static int __init nf_tables_arp_init(void)
+{
+	int ret;
+
+	nft_register_chain_type(&filter_arp);
+	ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_arp);
+
+	return ret;
+}
+
+static void __exit nf_tables_arp_exit(void)
+{
+	unregister_pernet_subsys(&nf_tables_arp_net_ops);
+	nft_unregister_chain_type(&filter_arp);
+}
+
+module_init(nf_tables_arp_init);
+module_exit(nf_tables_arp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
-- 
cgit v1.2.3


From ce4ff76c15a877a62097807a35518fc808c1853c Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:44 +0800
Subject: netfilter: ipt_CLUSTERIP: make proc directory per net namespace

Create /proc/net/ipt_CLUSTERIP directory for per net namespace.
Right now,only allow to create entries under the ipt_CLUSTERIP
in init net namespace.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 70 +++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0b732efd32e2..e66b91b92843 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/checksum.h>
 #include <net/ip.h>
 
@@ -64,9 +65,16 @@ static DEFINE_SPINLOCK(clusterip_lock);
 
 #ifdef CONFIG_PROC_FS
 static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
 #endif
 
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *procdir;
+#endif
+};
+
 static inline void
 clusterip_config_get(struct clusterip_config *c)
 {
@@ -158,6 +166,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 			struct net_device *dev)
 {
 	struct clusterip_config *c;
+	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
 
 	c = kzalloc(sizeof(*c), GFP_ATOMIC);
 	if (!c)
@@ -180,7 +189,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 		/* create proc dir entry */
 		sprintf(buffer, "%pI4", &ip);
 		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
-					  clusterip_procdir,
+					  cn->procdir,
 					  &clusterip_proc_fops, c);
 		if (!c->pde) {
 			kfree(c);
@@ -698,48 +707,71 @@ static const struct file_operations clusterip_proc_fops = {
 
 #endif /* CONFIG_PROC_FS */
 
+static int clusterip_net_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+	if (!cn->procdir) {
+		pr_err("Unable to proc dir entry\n");
+		return -ENOMEM;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+	.init = clusterip_net_init,
+	.exit = clusterip_net_exit,
+	.id   = &clusterip_net_id,
+	.size = sizeof(struct clusterip_net),
+};
+
 static int __init clusterip_tg_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&clusterip_tg_reg);
+	ret = register_pernet_subsys(&clusterip_net_ops);
 	if (ret < 0)
 		return ret;
 
+	ret = xt_register_target(&clusterip_tg_reg);
+	if (ret < 0)
+		goto cleanup_subsys;
+
 	ret = nf_register_hook(&cip_arp_ops);
 	if (ret < 0)
 		goto cleanup_target;
 
-#ifdef CONFIG_PROC_FS
-	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
-	if (!clusterip_procdir) {
-		pr_err("Unable to proc dir entry\n");
-		ret = -ENOMEM;
-		goto cleanup_hook;
-	}
-#endif /* CONFIG_PROC_FS */
-
 	pr_info("ClusterIP Version %s loaded successfully\n",
 		CLUSTERIP_VERSION);
+
 	return 0;
 
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
-	nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
 cleanup_target:
 	xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+	unregister_pernet_subsys(&clusterip_net_ops);
 	return ret;
 }
 
 static void __exit clusterip_tg_exit(void)
 {
 	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
-	proc_remove(clusterip_procdir);
-#endif
+
 	nf_unregister_hook(&cip_arp_ops);
 	xt_unregister_target(&clusterip_tg_reg);
+	unregister_pernet_subsys(&clusterip_net_ops);
 
 	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
 	rcu_barrier_bh();
-- 
cgit v1.2.3


From 26a89e435462bfdde586ad062bf190cdbfe53a49 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:45 +0800
Subject: netfilter: ipt_CLUSTERIP: make clusterip_list per net namespace

clusterip_configs should be per net namespace, so operate
cluster in one net namespace won't affect other net
namespace. right now, only allow to operate the clusterip_configs
of init net namespace.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e66b91b92843..8ef3e6f38635 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -58,8 +58,6 @@ struct clusterip_config {
 	struct rcu_head rcu;
 };
 
-static LIST_HEAD(clusterip_configs);
-
 /* clusterip_lock protects the clusterip_configs list */
 static DEFINE_SPINLOCK(clusterip_lock);
 
@@ -70,6 +68,7 @@ static const struct file_operations clusterip_proc_fops;
 static int clusterip_net_id __read_mostly;
 
 struct clusterip_net {
+	struct list_head configs;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *procdir;
 #endif
@@ -124,8 +123,9 @@ static struct clusterip_config *
 __clusterip_config_find(__be32 clusterip)
 {
 	struct clusterip_config *c;
+	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
 
-	list_for_each_entry_rcu(c, &clusterip_configs, list) {
+	list_for_each_entry_rcu(c, &cn->configs, list) {
 		if (c->clusterip == clusterip)
 			return c;
 	}
@@ -199,7 +199,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 #endif
 
 	spin_lock_bh(&clusterip_lock);
-	list_add_rcu(&c->list, &clusterip_configs);
+	list_add_rcu(&c->list, &cn->configs);
 	spin_unlock_bh(&clusterip_lock);
 
 	return c;
@@ -709,9 +709,11 @@ static const struct file_operations clusterip_proc_fops = {
 
 static int clusterip_net_init(struct net *net)
 {
-#ifdef CONFIG_PROC_FS
 	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
+	INIT_LIST_HEAD(&cn->configs);
+
+#ifdef CONFIG_PROC_FS
 	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
 	if (!cn->procdir) {
 		pr_err("Unable to proc dir entry\n");
-- 
cgit v1.2.3


From f1e8077f490cff4253b197154bf2affaa0ca08e3 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:46 +0800
Subject: netfilter: ipt_CLUSTERIP: make clusterip_lock per net namespace

this lock is used for protecting clusterip_configs of per
net namespace, it should be per net namespace too.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 8ef3e6f38635..1bf5aa3096c1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -58,9 +58,6 @@ struct clusterip_config {
 	struct rcu_head rcu;
 };
 
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
-
 #ifdef CONFIG_PROC_FS
 static const struct file_operations clusterip_proc_fops;
 #endif
@@ -69,6 +66,9 @@ static int clusterip_net_id __read_mostly;
 
 struct clusterip_net {
 	struct list_head configs;
+	/* lock protects the configs list */
+	spinlock_t lock;
+
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *procdir;
 #endif
@@ -99,10 +99,12 @@ clusterip_config_put(struct clusterip_config *c)
 static inline void
 clusterip_config_entry_put(struct clusterip_config *c)
 {
+	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
+
 	local_bh_disable();
-	if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+	if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
 		list_del_rcu(&c->list);
-		spin_unlock(&clusterip_lock);
+		spin_unlock(&cn->lock);
 		local_bh_enable();
 
 		dev_mc_del(c->dev, c->clustermac);
@@ -198,9 +200,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 	}
 #endif
 
-	spin_lock_bh(&clusterip_lock);
+	spin_lock_bh(&cn->lock);
 	list_add_rcu(&c->list, &cn->configs);
-	spin_unlock_bh(&clusterip_lock);
+	spin_unlock_bh(&cn->lock);
 
 	return c;
 }
@@ -713,6 +715,8 @@ static int clusterip_net_init(struct net *net)
 
 	INIT_LIST_HEAD(&cn->configs);
 
+	spin_lock_init(&cn->lock);
+
 #ifdef CONFIG_PROC_FS
 	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
 	if (!cn->procdir) {
-- 
cgit v1.2.3


From b5ef0f85bf76986e5076cd1e0820fa4e61325772 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:47 +0800
Subject: netfilter: ipt_CLUSTERIP: add parameter net in
 clusterip_config_find_get

Inorder to find clusterip_config in net namespace.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1bf5aa3096c1..b7fc9d5e06df 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -122,10 +122,10 @@ clusterip_config_entry_put(struct clusterip_config *c)
 }
 
 static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
 {
 	struct clusterip_config *c;
-	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
 	list_for_each_entry_rcu(c, &cn->configs, list) {
 		if (c->clusterip == clusterip)
@@ -136,12 +136,12 @@ __clusterip_config_find(__be32 clusterip)
 }
 
 static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
 {
 	struct clusterip_config *c;
 
 	rcu_read_lock_bh();
-	c = __clusterip_config_find(clusterip);
+	c = __clusterip_config_find(net, clusterip);
 	if (c) {
 		if (unlikely(!atomic_inc_not_zero(&c->refcount)))
 			c = NULL;
@@ -381,7 +381,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 
 	/* FIXME: further sanity checks */
 
-	config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+	config = clusterip_config_find_get(&init_net, e->ip.dst.s_addr, 1);
 	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			pr_info("no config found for %pI4, need 'new'\n",
@@ -519,7 +519,7 @@ arp_mangle(unsigned int hook,
 
 	/* if there is no clusterip configuration for the arp reply's
 	 * source ip, we don't want to mangle it */
-	c = clusterip_config_find_get(payload->src_ip, 0);
+	c = clusterip_config_find_get(&init_net, payload->src_ip, 0);
 	if (!c)
 		return NF_ACCEPT;
 
-- 
cgit v1.2.3


From f58d7866018dedae7ec67e152402b8ede17ce39e Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:48 +0800
Subject: netfilter: ipt_CLUSTERIP: create proc entry under proper
 ipt_CLUSTERIP directory

Create proc entries under the ipt_CLUSTERIP directory of proper
net namespace.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index b7fc9d5e06df..c93dfd2821ac 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -168,7 +168,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 			struct net_device *dev)
 {
 	struct clusterip_config *c;
-	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
+	struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
 
 	c = kzalloc(sizeof(*c), GFP_ATOMIC);
 	if (!c)
-- 
cgit v1.2.3


From d86946d2c5b4e519ffe435c2deeb2c9436ceb04f Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 15:38:49 +0800
Subject: netfilter: ipt_CLUSTERIP: use proper net namespace to operate
 CLUSTERIP

we can allow users in uninit net namespace to operate ipt_CLUSTERIP
now.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index c93dfd2821ac..ecd808a93b63 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -99,7 +99,8 @@ clusterip_config_put(struct clusterip_config *c)
 static inline void
 clusterip_config_entry_put(struct clusterip_config *c)
 {
-	struct clusterip_net *cn = net_generic(&init_net, clusterip_net_id);
+	struct net *net = dev_net(c->dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
 	local_bh_disable();
 	if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
@@ -381,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 
 	/* FIXME: further sanity checks */
 
-	config = clusterip_config_find_get(&init_net, e->ip.dst.s_addr, 1);
+	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
 	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			pr_info("no config found for %pI4, need 'new'\n",
@@ -395,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 				return -EINVAL;
 			}
 
-			dev = dev_get_by_name(&init_net, e->ip.iniface);
+			dev = dev_get_by_name(par->net, e->ip.iniface);
 			if (!dev) {
 				pr_info("no such interface %s\n",
 					e->ip.iniface);
@@ -503,6 +504,7 @@ arp_mangle(unsigned int hook,
 	struct arphdr *arp = arp_hdr(skb);
 	struct arp_payload *payload;
 	struct clusterip_config *c;
+	struct net *net = dev_net(in ? in : out);
 
 	/* we don't care about non-ethernet and non-ipv4 ARP */
 	if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -519,7 +521,7 @@ arp_mangle(unsigned int hook,
 
 	/* if there is no clusterip configuration for the arp reply's
 	 * source ip, we don't want to mangle it */
-	c = clusterip_config_find_get(&init_net, payload->src_ip, 0);
+	c = clusterip_config_find_get(net, payload->src_ip, 0);
 	if (!c)
 		return NF_ACCEPT;
 
-- 
cgit v1.2.3


From c1d607cc4a8ea1ef89d7f6f5728112bc5a52f2f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Oct 2013 08:54:49 -0700
Subject: inet_diag: use sock_gen_put()

TCP listener refactoring, part 6 :

Use sock_gen_put() from inet_diag_dump_one_icsk() for future
SYN_RECV support.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 41e1c3ea8b51..56a964a553d2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -336,12 +336,9 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 		err = 0;
 
 out:
-	if (sk) {
-		if (sk->sk_state == TCP_TIME_WAIT)
-			inet_twsk_put((struct inet_timewait_sock *)sk);
-		else
-			sock_put(sk);
-	}
+	if (sk)
+		sock_gen_put(sk);
+
 out_nosk:
 	return err;
 }
-- 
cgit v1.2.3


From 031afe4990a7c9dbff41a3a742c44d3e740ea0a1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Sat, 12 Oct 2013 10:16:27 -0700
Subject: tcp: fix incorrect ca_state in tail loss probe

On receiving an ACK that covers the loss probe sequence, TLP
immediately sets the congestion state to Open, even though some packets
are not recovered and retransmisssion are on the way.  The later ACks
may trigger a WARN_ON check in step D of tcp_fastretrans_alert(), e.g.,
https://bugzilla.redhat.com/show_bug.cgi?id=989251

The fix is to follow the similar procedure in recovery by calling
tcp_try_keep_open(). The sender switches to Open state if no packets
are retransmissted. Otherwise it goes to Disorder and let subsequent
ACKs move the state to Recovery or Open.

Reported-By: Michael Sterrett <michael@sterretts.net>
Tested-By: Dormando <dormando@rydia.net>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 113dc5f17d47..53974c7c04fe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3291,7 +3291,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 			tcp_init_cwnd_reduction(sk, true);
 			tcp_set_ca_state(sk, TCP_CA_CWR);
 			tcp_end_cwnd_reduction(sk);
-			tcp_set_ca_state(sk, TCP_CA_Open);
+			tcp_try_keep_open(sk);
 			NET_INC_STATS_BH(sock_net(sk),
 					 LINUX_MIB_TCPLOSSPROBERECOVERY);
 		}
-- 
cgit v1.2.3


From c52e2421f7368fd36cbe330d2cf41b10452e39a9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Oct 2013 11:54:30 -0700
Subject: tcp: must unclone packets before mangling them

TCP stack should make sure it owns skbs before mangling them.

We had various crashes using bnx2x, and it turned out gso_size
was cleared right before bnx2x driver was populating TC descriptor
of the _previous_ packet send. TCP stack can sometime retransmit
packets that are still in Qdisc.

Of course we could make bnx2x driver more robust (using
ACCESS_ONCE(shinfo->gso_size) for example), but the bug is TCP stack.

We have identified two points where skb_unclone() was needed.

This patch adds a WARN_ON_ONCE() to warn us if we missed another
fix of this kind.

Kudos to Neal for finding the root cause of this bug. Its visible
using small MSS.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c6f01f2cdb32..8fad1c155688 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -986,6 +986,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 				 unsigned int mss_now)
 {
+	/* Make sure we own this skb before messing gso_size/gso_segs */
+	WARN_ON_ONCE(skb_cloned(skb));
+
 	if (skb->len <= mss_now || !sk_can_gso(sk) ||
 	    skb->ip_summed == CHECKSUM_NONE) {
 		/* Avoid the costly divide in the normal
@@ -1067,9 +1070,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_cloned(skb) &&
-	    skb_is_nonlinear(skb) &&
-	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+	if (skb_unclone(skb, GFP_ATOMIC))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
@@ -2344,6 +2345,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		int oldpcount = tcp_skb_pcount(skb);
 
 		if (unlikely(oldpcount > 1)) {
+			if (skb_unclone(skb, GFP_ATOMIC))
+				return -ENOMEM;
 			tcp_init_tso_segs(sk, skb, cur_mss);
 			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
 		}
-- 
cgit v1.2.3


From 8f26fb1c1ed81c33f5d87c5936f4d9d1b4118918 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Oct 2013 12:24:54 -0700
Subject: tcp: remove the sk_can_gso() check from tcp_set_skb_tso_segs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sk_can_gso() should only be used as a hint in tcp_sendmsg() to build GSO
packets in the first place. (As a performance hint)

Once we have GSO packets in write queue, we can not decide they are no
longer GSO only because flow now uses a route which doesn't handle
TSO/GSO.

Core networking stack handles the case very well for us, all we need
is keeping track of packet counts in MSS terms, regardless of
segmentation done later (in GSO or hardware)

Right now, if  tcp_fragment() splits a GSO packet in two parts,
@left and @right, and route changed through a non GSO device,
both @left and @right have pcount set to 1, which is wrong,
and leads to incorrect packet_count tracking.

This problem was added in commit d5ac99a648 ("[TCP]: skb pcount with MTU
discovery")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8fad1c155688..d46f2143305c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -989,8 +989,7 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 	/* Make sure we own this skb before messing gso_size/gso_segs */
 	WARN_ON_ONCE(skb_cloned(skb));
 
-	if (skb->len <= mss_now || !sk_can_gso(sk) ||
-	    skb->ip_summed == CHECKSUM_NONE) {
+	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-- 
cgit v1.2.3


From 0baf2b35fc70ab16c385963d2502da26a55d2cb7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Oct 2013 02:49:04 -0700
Subject: ipv4: shrink rt_cache_stat

Half of the rt_cache_stat fields are no longer used after IP
route cache removal, lets shrink this per cpu area.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  8 --------
 net/ipv4/route.c    | 16 ++++++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 0ad8e0102386..dd4ae0029fd8 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -88,22 +88,14 @@ struct ip_rt_acct {
 };
 
 struct rt_cache_stat {
-        unsigned int in_hit;
         unsigned int in_slow_tot;
         unsigned int in_slow_mc;
         unsigned int in_no_route;
         unsigned int in_brd;
         unsigned int in_martian_dst;
         unsigned int in_martian_src;
-        unsigned int out_hit;
         unsigned int out_slow_tot;
         unsigned int out_slow_mc;
-        unsigned int gc_total;
-        unsigned int gc_ignored;
-        unsigned int gc_goal_miss;
-        unsigned int gc_dst_overflow;
-        unsigned int in_hlist_search;
-        unsigned int out_hlist_search;
 };
 
 extern struct ip_rt_acct __percpu *ip_rt_acct;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6011615e810d..d2d325382b13 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -295,7 +295,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 		   dst_entries_get_slow(&ipv4_dst_ops),
-		   st->in_hit,
+		   0, /* st->in_hit */
 		   st->in_slow_tot,
 		   st->in_slow_mc,
 		   st->in_no_route,
@@ -303,16 +303,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 		   st->in_martian_dst,
 		   st->in_martian_src,
 
-		   st->out_hit,
+		   0, /* st->out_hit */
 		   st->out_slow_tot,
 		   st->out_slow_mc,
 
-		   st->gc_total,
-		   st->gc_ignored,
-		   st->gc_goal_miss,
-		   st->gc_dst_overflow,
-		   st->in_hlist_search,
-		   st->out_hlist_search
+		   0, /* st->gc_total */
+		   0, /* st->gc_ignored */
+		   0, /* st->gc_goal_miss */
+		   0, /* st->gc_dst_overflow */
+		   0, /* st->in_hlist_search */
+		   0  /* st->out_hlist_search */
 		);
 	return 0;
 }
-- 
cgit v1.2.3


From 28be6e07e8bccee76b51bca8fdba52c1b28fc77c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Oct 2013 10:36:17 -0700
Subject: tcp: rename tcp_tso_segment()

Rename tcp_tso_segment() to tcp_gso_segment(), to better reflect
what is going on, and ease grep games.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        | 2 +-
 net/ipv4/tcp_offload.c   | 6 +++---
 net/ipv6/tcpv6_offload.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1db3a016bff6..372dcccfeed0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1548,7 +1548,7 @@ extern struct request_sock_ops tcp6_request_sock_ops;
 
 void tcp_v4_destroy_sock(struct sock *sk);
 
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
 struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb);
 int tcp_gro_complete(struct sk_buff *skb);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 3a7525e6c086..8e3113f46ec1 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,7 +14,7 @@
 #include <net/tcp.h>
 #include <net/protocol.h>
 
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
@@ -139,7 +139,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
 out:
 	return segs;
 }
-EXPORT_SYMBOL(tcp_tso_segment);
+EXPORT_SYMBOL(tcp_gso_segment);
 
 struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
@@ -320,7 +320,7 @@ static int tcp4_gro_complete(struct sk_buff *skb)
 static const struct net_offload tcpv4_offload = {
 	.callbacks = {
 		.gso_send_check	=	tcp_v4_gso_send_check,
-		.gso_segment	=	tcp_tso_segment,
+		.gso_segment	=	tcp_gso_segment,
 		.gro_receive	=	tcp4_gro_receive,
 		.gro_complete	=	tcp4_gro_complete,
 	},
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 2ec6bf6a0aa0..c1097c798900 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -83,7 +83,7 @@ static int tcp6_gro_complete(struct sk_buff *skb)
 static const struct net_offload tcpv6_offload = {
 	.callbacks = {
 		.gso_send_check	=	tcp_v6_gso_send_check,
-		.gso_segment	=	tcp_tso_segment,
+		.gso_segment	=	tcp_gso_segment,
 		.gro_receive	=	tcp6_gro_receive,
 		.gro_complete	=	tcp6_gro_complete,
 	},
-- 
cgit v1.2.3


From 77dfca7e45518ea87a4e90f9142b87687f55f708 Mon Sep 17 00:00:00 2001
From: "baker.zhang" <baker.kernel@gmail.com>
Date: Sun, 13 Oct 2013 19:50:09 +0800
Subject: fib_trie: remove duplicated rcu lock

fib_table_lookup has included the rcu lock protection.

Signed-off-by: baker.zhang <baker.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b3f627ac4ed8..d846304b7b89 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -933,7 +933,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		local_bh_disable();
 
 		frn->tb_id = tb->tb_id;
-		rcu_read_lock();
 		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 
 		if (!frn->err) {
@@ -942,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 			frn->type = res.type;
 			frn->scope = res.scope;
 		}
-		rcu_read_unlock();
 		local_bh_enable();
 	}
 }
-- 
cgit v1.2.3


From 9877b25382e770618d0a36a3024d8a3c67eee9ea Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 17 Oct 2013 13:34:11 -0700
Subject: fib: Use const struct nl_info * in rtmsg_fib

The rtmsg_fib function doesn't modify this argument so mark
it const.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_lookup.h    | 2 +-
 net/ipv4/fib_semantics.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index af0f14aba169..50cfb3ef560f 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -32,7 +32,7 @@ extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 			 int dst_len, u8 tos, struct fib_info *fi,
 			 unsigned int);
 extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
-		      int dst_len, u32 tb_id, struct nl_info *info,
+		      int dst_len, u32 tb_id, const struct nl_info *info,
 		      unsigned int nlm_flags);
 extern struct fib_alias *fib_find_alias(struct list_head *fah,
 					u8 tos, u32 prio);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d5dbca5ecf62..e63f47a4e651 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 }
 
 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
-	       int dst_len, u32 tb_id, struct nl_info *info,
+	       int dst_len, u32 tb_id, const struct nl_info *info,
 	       unsigned int nlm_flags)
 {
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From 675297c904b419542a93de64afaf2a6bd72e35e3 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 16 Oct 2013 12:36:51 -0400
Subject: tcp: remove redundant code in __tcp_retransmit_skb()

Remove the specialized code in __tcp_retransmit_skb() that tries to trim
any ACKed payload preceding a FIN before we retransmit (this was added
in 1999 in v2.2.3pre3). This trimming code was made unreachable by the
more general code added above it that uses tcp_trim_head() to trim any
ACKed payload, with or without a FIN (this was added in "[NET]: Add
segmentation offload support to TCP." in 2002 circa v2.5.33).

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e5ce0e1d13b7..ce7c4d9d9195 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2351,21 +2351,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 
 	tcp_retrans_try_collapse(sk, skb, cur_mss);
 
-	/* Some Solaris stacks overoptimize and ignore the FIN on a
-	 * retransmit when old data is attached.  So strip it off
-	 * since it is cheap to do so and saves bytes on the network.
-	 */
-	if (skb->len > 0 &&
-	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
-	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
-		if (!pskb_trim(skb, 0)) {
-			/* Reuse, even though it does some unnecessary work */
-			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
-					     TCP_SKB_CB(skb)->tcp_flags);
-			skb->ip_summed = CHECKSUM_NONE;
-		}
-	}
-
 	/* Make a copy, if the first transmission SKB clone we made
 	 * is still in somebody's hands, else make a clone.
 	 */
-- 
cgit v1.2.3


From 47d27aad44169372f358cda88a223883f6760fa5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Oct 2013 13:13:27 -0700
Subject: ipv4: gso: send_check() & segment() cleanups

inet_gso_segment() and inet_gso_send_check() are called by
skb_mac_gso_segment() under rcu lock, no need to use
rcu_read_lock() / rcu_read_unlock()

Avoid calling ip_hdr() twice per function.

We can use ip_send_check() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 35913fb77dc8..4f8cd4fc451d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1254,20 +1254,19 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	if (ihl < sizeof(*iph))
 		goto out;
 
+	proto = iph->protocol;
+
+	/* Warning: after this point, iph might be no longer valid */
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
-
 	__skb_pull(skb, ihl);
+
 	skb_reset_transport_header(skb);
-	iph = ip_hdr(skb);
-	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;
 
-	rcu_read_lock();
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_send_check))
 		err = ops->callbacks.gso_send_check(skb);
-	rcu_read_unlock();
 
 out:
 	return err;
@@ -1305,23 +1304,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (ihl < sizeof(*iph))
 		goto out;
 
+	id = ntohs(iph->id);
+	proto = iph->protocol;
+
+	/* Warning: after this point, iph might be no longer valid */
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
+	__skb_pull(skb, ihl);
 
 	tunnel = !!skb->encapsulation;
 
-	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
-	iph = ip_hdr(skb);
-	id = ntohs(iph->id);
-	proto = iph->protocol;
+
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
-	rcu_read_lock();
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment))
 		segs = ops->callbacks.gso_segment(skb, features);
-	rcu_read_unlock();
 
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
@@ -1339,8 +1338,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			iph->id = htons(id++);
 		}
 		iph->tot_len = htons(skb->len - skb->mac_len);
-		iph->check = 0;
-		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+		ip_send_check(iph);
 	} while ((skb = skb->next));
 
 out:
-- 
cgit v1.2.3


From 7e58487b8cf5871d2a0fa03892dbd4b3a620d07f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 18 Oct 2013 13:48:24 -0700
Subject: net: ipv4/ipv6: Remove extern from function prototypes

There are a mix of function prototypes with and without extern
in the kernel sources.  Standardize on not using extern for
function prototypes.

Function prototypes don't need to be written with extern.
extern is assumed by the compiler.  Its use is as unnecessary as
using auto to declare automatic/local variables in a block.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_lookup.h | 26 +++++++++++---------------
 net/ipv4/tcp_vegas.h  | 10 +++++-----
 net/ipv4/udp_impl.h   | 36 ++++++++++++++++++------------------
 net/ipv6/udp_impl.h   | 41 ++++++++++++++++++++---------------------
 4 files changed, 54 insertions(+), 59 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 50cfb3ef560f..388d113fd289 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -24,21 +24,17 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
 }
 
 /* Exported by fib_semantics.c */
-extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(struct fib_config *cfg);
-extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
-			 u32 tb_id, u8 type, __be32 dst,
-			 int dst_len, u8 tos, struct fib_info *fi,
-			 unsigned int);
-extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
-		      int dst_len, u32 tb_id, const struct nl_info *info,
-		      unsigned int nlm_flags);
-extern struct fib_alias *fib_find_alias(struct list_head *fah,
-					u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
-			    struct fib_info **last_resort,
-			    int *last_idx, int dflt);
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+		  u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+		  unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
+int fib_detect_death(struct fib_info *fi, int order,
+		     struct fib_info **last_resort, int *last_idx, int dflt);
 
 static inline void fib_result_assign(struct fib_result *res,
 				     struct fib_info *fi)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 6c0eea2f8249..0531b99d8637 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -15,10 +15,10 @@ struct vegas {
 	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
 };
 
-extern void tcp_vegas_init(struct sock *sk);
-extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
-extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
-extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
 
 #endif	/* __TCP_VEGAS_H */
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 5a681e298b90..f3c27899f62b 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,30 +5,30 @@
 #include <net/protocol.h>
 #include <net/inet_common.h>
 
-extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
-extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
 
-extern int	udp_v4_get_port(struct sock *sk, unsigned short snum);
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
 
-extern int	udp_setsockopt(struct sock *sk, int level, int optname,
-			       char __user *optval, unsigned int optlen);
-extern int	udp_getsockopt(struct sock *sk, int level, int optname,
-			       char __user *optval, int __user *optlen);
+int udp_setsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, int __user *optlen);
 
 #ifdef CONFIG_COMPAT
-extern int	compat_udp_setsockopt(struct sock *sk, int level, int optname,
-				      char __user *optval, unsigned int optlen);
-extern int	compat_udp_getsockopt(struct sock *sk, int level, int optname,
-				      char __user *optval, int __user *optlen);
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen);
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen);
 #endif
-extern int	udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-			    size_t len, int noblock, int flags, int *addr_len);
-extern int	udp_sendpage(struct sock *sk, struct page *page, int offset,
-			     size_t size, int flags);
-extern int	udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-extern void	udp_destroy_sock(struct sock *sk);
+int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len, int noblock, int flags, int *addr_len);
+int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+		 int flags);
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udp_destroy_sock(struct sock *sk);
 
 #ifdef CONFIG_PROC_FS
-extern int	udp4_seq_show(struct seq_file *seq, void *v);
+int udp4_seq_show(struct seq_file *seq, void *v);
 #endif
 #endif	/* _UDP4_IMPL_H */
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 4691ed50a928..c779c3c90b9d 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -7,33 +7,32 @@
 #include <net/inet_common.h>
 #include <net/transp_v6.h>
 
-extern int  	__udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
-extern void 	__udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
-			       u8 , u8 , int , __be32 , struct udp_table *);
+int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+		    __be32, struct udp_table *);
 
-extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
+int udp_v6_get_port(struct sock *sk, unsigned short snum);
 
-extern int	udpv6_getsockopt(struct sock *sk, int level, int optname,
-				 char __user *optval, int __user *optlen);
-extern int	udpv6_setsockopt(struct sock *sk, int level, int optname,
-				 char __user *optval, unsigned int optlen);
+int udpv6_getsockopt(struct sock *sk, int level, int optname,
+		     char __user *optval, int __user *optlen);
+int udpv6_setsockopt(struct sock *sk, int level, int optname,
+		     char __user *optval, unsigned int optlen);
 #ifdef CONFIG_COMPAT
-extern int	compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
-					char __user *optval, unsigned int optlen);
-extern int	compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
-				       char __user *optval, int __user *optlen);
+int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, unsigned int optlen);
+int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen);
 #endif
-extern int	udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
-			      struct msghdr *msg, size_t len);
-extern int	udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
-			      struct msghdr *msg, size_t len,
-			      int noblock, int flags, int *addr_len);
-extern int	udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
-extern void	udpv6_destroy_sock(struct sock *sk);
+int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		  size_t len);
+int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		  size_t len, int noblock, int flags, int *addr_len);
+int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udpv6_destroy_sock(struct sock *sk);
 
-extern void udp_v6_clear_sk(struct sock *sk, int size);
+void udp_v6_clear_sk(struct sock *sk, int size);
 
 #ifdef CONFIG_PROC_FS
-extern int	udp6_seq_show(struct seq_file *seq, void *v);
+int udp6_seq_show(struct seq_file *seq, void *v);
 #endif
 #endif	/* _UDP6_IMPL_H */
-- 
cgit v1.2.3


From e93b7d748be887cd7639b113ba7d7ef792a7efb9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sat, 19 Oct 2013 12:29:17 +0200
Subject: ip_output: do skb ufo init for peeked non ufo skb as well

Now, if user application does:
sendto len<mtu flag MSG_MORE
sendto len>mtu flag 0
The skb is not treated as fragmented one because it is not initialized
that way. So move the initialization to fix this.

introduced by:
commit e89e9cf539a28df7d0eb1d0a545368e9920b34ac "[IPv4/IPv6]: UFO Scatter-gather approach"

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a04d872c54f9..3982eabf61e1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -772,15 +772,20 @@ static inline int ip_ufo_append_data(struct sock *sk,
 		/* initialize protocol header pointer */
 		skb->transport_header = skb->network_header + fragheaderlen;
 
-		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum = 0;
 
-		/* specify the length of each IP datagram fragment */
-		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
-		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
 		__skb_queue_tail(queue, skb);
+	} else if (skb_is_gso(skb)) {
+		goto append;
 	}
 
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	/* specify the length of each IP datagram fragment */
+	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
 	return skb_append_datato_frags(sk, skb, getfrag, from,
 				       (length - transhdrlen));
 }
-- 
cgit v1.2.3


From 2d26f0a3c0e22f6b3096a2503d086e4b5e99d708 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 19 Oct 2013 11:42:55 -0700
Subject: ipv4: generalize gre_handle_offloads

This patch makes gre_handle_offloads() more generic
and rename it to iptunnel_handle_offloads()

This will be used to add GSO/TSO support to IPIP tunnels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gre.h         |  8 +++++++-
 include/net/ip_tunnels.h  |  3 +++
 net/ipv4/gre_demux.c      | 29 -----------------------------
 net/ipv4/ip_tunnel_core.c | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/gre.h b/include/net/gre.h
index 57e4afdf7879..dcd9ae3270d3 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -38,7 +38,13 @@ void gre_offload_exit(void);
 
 void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 		      int hdr_len);
-struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
+
+static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
+						  bool gre_csum)
+{
+	return iptunnel_handle_offloads(skb, gre_csum, SKB_GSO_GRE);
+}
+
 
 static inline int ip_gre_calc_hlen(__be16 o_flags)
 {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a0a4a100f5c9..732f8c6ae975 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -150,6 +150,9 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
 		  __u8 tos, __u8 ttl, __be16 df, bool xnet);
 
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum,
+					 int gso_type_mask);
+
 static inline void iptunnel_xmit_stats(int err,
 				       struct net_device_stats *err_stats,
 				       struct pcpu_tstats __percpu *stats)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 736c9fc3ef93..5893e99e8299 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -93,35 +93,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 }
 EXPORT_SYMBOL_GPL(gre_build_header);
 
-struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
-{
-	int err;
-
-	if (likely(!skb->encapsulation)) {
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-	}
-
-	if (skb_is_gso(skb)) {
-		err = skb_unclone(skb, GFP_ATOMIC);
-		if (unlikely(err))
-			goto error;
-		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
-		return skb;
-	} else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
-		err = skb_checksum_help(skb);
-		if (unlikely(err))
-			goto error;
-	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
-		skb->ip_summed = CHECKSUM_NONE;
-
-	return skb;
-error:
-	kfree_skb(skb);
-	return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(gre_handle_offloads);
-
 static __sum16 check_checksum(struct sk_buff *skb)
 {
 	__sum16 csum = 0;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c31e3ad98ef2..42ffbc8d65c6 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -116,3 +116,36 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
+					 bool csum_help,
+					 int gso_type_mask)
+{
+	int err;
+
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	if (skb_is_gso(skb)) {
+		err = skb_unclone(skb, GFP_ATOMIC);
+		if (unlikely(err))
+			goto error;
+		skb_shinfo(skb)->gso_type |= gso_type_mask;
+		return skb;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+		err = skb_checksum_help(skb);
+		if (unlikely(err))
+			goto error;
+	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
+		skb->ip_summed = CHECKSUM_NONE;
+
+	return skb;
+error:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
-- 
cgit v1.2.3


From 3347c960295583eee3fd58e5c539fb1972fbc005 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 19 Oct 2013 11:42:56 -0700
Subject: ipv4: gso: make inet_gso_segment() stackable

In order to support GSO on IPIP, we need to make
inet_gso_segment() stackable.

It should not assume network header starts right after mac
header.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  7 +++++--
 net/core/dev.c         |  2 ++
 net/ipv4/af_inet.c     | 25 ++++++++++++++++++-------
 3 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ba74474836c0..cad1e0c5cc04 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2722,9 +2722,12 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 /* Keeps track of mac header offset relative to skb->head.
  * It is useful for TSO of Tunneling protocol. e.g. GRE.
  * For non-tunnel skb it points to skb_mac_header() and for
- * tunnel skb it points to outer mac header. */
+ * tunnel skb it points to outer mac header.
+ * Keeps track of level of encapsulation of network headers.
+ */
 struct skb_gso_cb {
-	int mac_offset;
+	int	mac_offset;
+	int	encap_level;
 };
 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b6eadf69289..0918aadc20fd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2377,6 +2377,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 	}
 
 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+	SKB_GSO_CB(skb)->encap_level = 0;
+
 	skb_reset_mac_header(skb);
 	skb_reset_mac_len(skb);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4f8cd4fc451d..5783ab5b5ef8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1273,16 +1273,17 @@ out:
 }
 
 static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
-	netdev_features_t features)
+					netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
+	unsigned int offset = 0;
 	struct iphdr *iph;
+	bool tunnel;
 	int proto;
+	int nhoff;
 	int ihl;
 	int id;
-	unsigned int offset = 0;
-	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_TCPV4 |
@@ -1296,6 +1297,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       0)))
 		goto out;
 
+	skb_reset_network_header(skb);
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
 	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
 		goto out;
 
@@ -1312,7 +1315,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		goto out;
 	__skb_pull(skb, ihl);
 
-	tunnel = !!skb->encapsulation;
+	tunnel = SKB_GSO_CB(skb)->encap_level > 0;
+	if (tunnel)
+		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	SKB_GSO_CB(skb)->encap_level += ihl;
 
 	skb_reset_transport_header(skb);
 
@@ -1327,18 +1333,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	skb = segs;
 	do {
-		iph = ip_hdr(skb);
+		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
 		if (!tunnel && proto == IPPROTO_UDP) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next != NULL)
 				iph->frag_off |= htons(IP_MF);
-			offset += (skb->len - skb->mac_len - iph->ihl * 4);
+			offset += skb->len - nhoff - ihl;
 		} else  {
 			iph->id = htons(id++);
 		}
-		iph->tot_len = htons(skb->len - skb->mac_len);
+		iph->tot_len = htons(skb->len - nhoff);
 		ip_send_check(iph);
+		if (tunnel) {
+			skb_reset_inner_headers(skb);
+			skb->encapsulation = 1;
+		}
+		skb->network_header = (u8 *)iph - skb->head;
 	} while ((skb = skb->next));
 
 out:
-- 
cgit v1.2.3


From cb32f511a70be8967ac9025cf49c44324ced9a39 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 19 Oct 2013 11:42:57 -0700
Subject: ipip: add GSO/TSO support

Now inet_gso_segment() is stackable, its relatively easy to
implement GSO/TSO support for IPIP

Performance results, when segmentation is done after tunnel
device (as no NIC is yet enabled for TSO IPIP support) :

Before patch :

lpq83:~# ./netperf -H 7.7.9.84 -Cc
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      3357.88   5.09     3.70     2.983   2.167

After patch :

lpq83:~# ./netperf -H 7.7.9.84 -Cc
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      7710.19   4.52     6.62     1.152   1.687

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  2 ++
 include/linux/skbuff.h          |  6 ++++--
 net/core/ethtool.c              |  1 +
 net/ipv4/af_inet.c              |  9 +++++++++
 net/ipv4/gre_offload.c          |  3 ++-
 net/ipv4/ipip.c                 | 11 ++++++-----
 net/ipv4/tcp_offload.c          |  1 +
 net/ipv4/udp_offload.c          |  1 +
 net/ipv6/ip6_offload.c          |  1 +
 net/ipv6/udp_offload.c          |  1 +
 net/mpls/mpls_gso.c             |  1 +
 11 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a2a89a5c7be5..8dad68cede1c 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -42,6 +42,7 @@ enum {
 	NETIF_F_TSO6_BIT,		/* ... TCPv6 segmentation */
 	NETIF_F_FSO_BIT,		/* ... FCoE segmentation */
 	NETIF_F_GSO_GRE_BIT,		/* ... GRE with TSO */
+	NETIF_F_GSO_IPIP_BIT,		/* ... IPIP tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
@@ -107,6 +108,7 @@ enum {
 #define NETIF_F_RXFCS		__NETIF_F(RXFCS)
 #define NETIF_F_RXALL		__NETIF_F(RXALL)
 #define NETIF_F_GSO_GRE		__NETIF_F(GSO_GRE)
+#define NETIF_F_GSO_IPIP	__NETIF_F(GSO_IPIP)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cad1e0c5cc04..60729134d253 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -318,9 +318,11 @@ enum {
 
 	SKB_GSO_GRE = 1 << 6,
 
-	SKB_GSO_UDP_TUNNEL = 1 << 7,
+	SKB_GSO_IPIP = 1 << 7,
 
-	SKB_GSO_MPLS = 1 << 8,
+	SKB_GSO_UDP_TUNNEL = 1 << 8,
+
+	SKB_GSO_MPLS = 1 << 9,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 78e9d9223e40..8cab7744790e 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -81,6 +81,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation",
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
+	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation",
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5783ab5b5ef8..4049906010f7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1291,6 +1291,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_IPIP |
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_MPLS |
@@ -1656,6 +1657,13 @@ static struct packet_offload ip_packet_offload __read_mostly = {
 	},
 };
 
+static const struct net_offload ipip_offload = {
+	.callbacks = {
+		.gso_send_check = inet_gso_send_check,
+		.gso_segment	= inet_gso_segment,
+	},
+};
+
 static int __init ipv4_offload_init(void)
 {
 	/*
@@ -1667,6 +1675,7 @@ static int __init ipv4_offload_init(void)
 		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
 
 	dev_add_offload(&ip_packet_offload);
+	inet_add_offload(&ipip_offload, IPPROTO_IPIP);
 	return 0;
 }
 
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 55e6bfb3a289..e5d436188464 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -39,7 +39,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_UDP |
 				  SKB_GSO_DODGY |
 				  SKB_GSO_TCP_ECN |
-				  SKB_GSO_GRE)))
+				  SKB_GSO_GRE |
+				  SKB_GSO_IPIP)))
 		goto out;
 
 	if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7f80fb4b82d3..fe3e9f7f1f0b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
 
-	if (likely(!skb->encapsulation)) {
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-	}
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto out;
 
 	ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
 	return NETDEV_TX_OK;
 
 tx_error:
-	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
+out:
+	dev->stats.tx_errors++;
 	return NETDEV_TX_OK;
 }
 
@@ -275,6 +275,7 @@ static const struct net_device_ops ipip_netdev_ops = {
 #define IPIP_FEATURES (NETIF_F_SG |		\
 		       NETIF_F_FRAGLIST |	\
 		       NETIF_F_HIGHDMA |	\
+		       NETIF_F_GSO_SOFTWARE |	\
 		       NETIF_F_HW_CSUM)
 
 static void ipip_tunnel_setup(struct net_device *dev)
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8e3113f46ec1..dfc96b00673e 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -56,6 +56,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_TCP_ECN |
 			       SKB_GSO_TCPV6 |
 			       SKB_GSO_GRE |
+			       SKB_GSO_IPIP |
 			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       0) ||
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f35eccaa855e..83206de2bc76 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -52,6 +52,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 
 		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
+				      SKB_GSO_IPIP |
 				      SKB_GSO_GRE | SKB_GSO_MPLS) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index b405fba91c72..5c2fc1d04196 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -96,6 +96,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_IPIP |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 60559511bd9c..f63780ff3732 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -64,6 +64,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_GRE |
+				      SKB_GSO_IPIP |
 				      SKB_GSO_MPLS) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 1bec1219ab81..851cd880b0c0 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -33,6 +33,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_DODGY |
 				  SKB_GSO_TCP_ECN |
 				  SKB_GSO_GRE |
+				  SKB_GSO_IPIP |
 				  SKB_GSO_MPLS)))
 		goto out;
 
-- 
cgit v1.2.3


From 65cd8033ff375b68037df61603ee68070dc48578 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 19 Oct 2013 21:48:51 +0200
Subject: ipv4: split inet_ehashfn to hash functions per compilation unit

This duplicates a bit of code but let's us easily introduce
separate secret keys later. The separate compilation units are
ipv4/inet_hashtabbles.o, ipv4/udp.o and rds/connection.o.

Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h    | 22 ++++++----------------
 net/ipv4/inet_hashtables.c | 21 +++++++++++++++++++++
 net/ipv4/udp.c             | 16 ++++++++++++----
 net/rds/connection.c       |  6 +++---
 4 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 06da91efbc83..7a6c7f80a8fd 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -208,26 +208,16 @@ extern u32 inet_ehash_secret;
 extern u32 ipv6_hash_secret;
 void build_ehash_secret(void);
 
-static inline unsigned int inet_ehashfn(struct net *net,
-					const __be32 laddr, const __u16 lport,
-					const __be32 faddr, const __be16 fport)
+static inline unsigned int __inet_ehashfn(const __be32 laddr,
+					  const __u16 lport,
+					  const __be32 faddr,
+					  const __be16 fport,
+					  u32 initval)
 {
 	return jhash_3words((__force __u32) laddr,
 			    (__force __u32) faddr,
 			    ((__u32) lport) << 16 | (__force __u32)fport,
-			    inet_ehash_secret + net_hash_mix(net));
-}
-
-static inline int inet_sk_ehashfn(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const __be32 laddr = inet->inet_rcv_saddr;
-	const __u16 lport = inet->inet_num;
-	const __be32 faddr = inet->inet_daddr;
-	const __be16 fport = inet->inet_dport;
-	struct net *net = sock_net(sk);
-
-	return inet_ehashfn(net, laddr, lport, faddr, fport);
+			    initval);
 }
 
 static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a4b66bbe4f21..18aa668d0cc9 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,6 +24,27 @@
 #include <net/secure_seq.h>
 #include <net/ip.h>
 
+static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
+				 const __u16 lport, const __be32 faddr,
+				 const __be16 fport)
+{
+	return __inet_ehashfn(laddr, lport, faddr, fport,
+			      inet_ehash_secret + net_hash_mix(net));
+}
+
+
+static unsigned int inet_sk_ehashfn(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const __be32 laddr = inet->inet_rcv_saddr;
+	const __u16 lport = inet->inet_num;
+	const __be32 faddr = inet->inet_daddr;
+	const __be16 fport = inet->inet_dport;
+	struct net *net = sock_net(sk);
+
+	return inet_ehashfn(net, laddr, lport, faddr, fport);
+}
+
 /*
  * Allocate and initialize a new local port bind bucket.
  * The bindhash mutex for snum's hash chain must be held here.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9f27bb800607..b4437c7db6ce 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -407,6 +407,14 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 	return score;
 }
 
+static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
+				 const __u16 lport, const __be32 faddr,
+				 const __be16 fport)
+{
+	return __inet_ehashfn(laddr, lport, faddr, fport,
+			      inet_ehash_secret + net_hash_mix(net));
+}
+
 
 /* called with read_rcu_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
@@ -430,8 +438,8 @@ begin:
 			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				hash = inet_ehashfn(net, daddr, hnum,
-						    saddr, sport);
+				hash = udp_ehashfn(net, daddr, hnum,
+						   saddr, sport);
 				matches = 1;
 			}
 		} else if (score == badness && reuseport) {
@@ -511,8 +519,8 @@ begin:
 			badness = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				hash = inet_ehashfn(net, daddr, hnum,
-						    saddr, sport);
+				hash = udp_ehashfn(net, daddr, hnum,
+						   saddr, sport);
 				matches = 1;
 			}
 		} else if (score == badness && reuseport) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 642ad42c416b..45e23660437a 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -52,9 +52,9 @@ static struct kmem_cache *rds_conn_slab;
 static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 {
 	/* Pass NULL, don't need struct net for hash */
-	unsigned long hash = inet_ehashfn(NULL,
-					  be32_to_cpu(laddr), 0,
-					  be32_to_cpu(faddr), 0);
+	unsigned long hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
+					    be32_to_cpu(faddr), 0,
+					    inet_ehash_secret);
 	return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
 }
 
-- 
cgit v1.2.3


From b23a002fc6f0c19846ee0382f019429af54a27e9 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 19 Oct 2013 21:48:56 +0200
Subject: inet: split syncookie keys for ipv4 and ipv6 and initialize with
 net_get_random_once

This patch splits the secret key for syncookies for ipv4 and ipv6 and
initializes them with net_get_random_once. This change was the reason I
did this series. I think the initialization of the syncookie_secret is
way to early.

Cc: Florian Westphal <fw@strlen.de>
Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  1 -
 net/ipv4/syncookies.c | 15 +++++----------
 net/ipv6/syncookies.c | 12 +++++++++---
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 372dcccfeed0..f30326f1c92b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -475,7 +475,6 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 
 /* From syncookies.c */
-extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3b64c59b4109..b95331e6c077 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -25,15 +25,7 @@
 
 extern int sysctl_tcp_syncookies;
 
-__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
-EXPORT_SYMBOL(syncookie_secret);
-
-static __init int init_syncookies(void)
-{
-	get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
-	return 0;
-}
-__initcall(init_syncookies);
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
 
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -44,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
 {
-	__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
+	__u32 *tmp;
+
+	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
 
+	tmp  = __get_cpu_var(ipv4_cookie_scratch);
 	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
 	tmp[0] = (__force u32)saddr;
 	tmp[1] = (__force u32)daddr;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index d04d3f1dd9b7..535a3ad262f1 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -24,6 +24,8 @@
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
+static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS];
+
 /* RFC 2460, Section 8.3:
  * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
  *
@@ -61,14 +63,18 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
 static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
 {
-	__u32 *tmp = __get_cpu_var(ipv6_cookie_scratch);
+	__u32 *tmp;
+
+	net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
+
+	tmp  = __get_cpu_var(ipv6_cookie_scratch);
 
 	/*
 	 * we have 320 bits of information to hash, copy in the remaining
-	 * 192 bits required for sha_transform, from the syncookie_secret
+	 * 192 bits required for sha_transform, from the syncookie6_secret
 	 * and overwrite the digest with the secret
 	 */
-	memcpy(tmp + 10, syncookie_secret[c], 44);
+	memcpy(tmp + 10, syncookie6_secret[c], 44);
 	memcpy(tmp, saddr, 16);
 	memcpy(tmp + 4, daddr, 16);
 	tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
-- 
cgit v1.2.3


From 1bbdceef1e535add893bf71d7b7ab102e4eb69eb Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 19 Oct 2013 21:48:57 +0200
Subject: inet: convert inet_ehash_secret and ipv6_hash_secret to
 net_get_random_once

Initialize the ehash and ipv6_hash_secrets with net_get_random_once.

Each compilation unit gets its own secret now:
  ipv4/inet_hashtables.o
  ipv4/udp.o
  ipv6/inet6_hashtables.o
  ipv6/udp.o
  rds/connection.o

The functions still get inlined into the hashing functions. In the fast
path we have at most two (needed in ipv6) if (unlikely(...)).

Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h     |  4 ----
 net/ipv4/af_inet.c          | 27 ---------------------------
 net/ipv4/inet_hashtables.c  |  4 ++++
 net/ipv4/udp.c              |  6 +++++-
 net/ipv6/af_inet6.c         |  5 -----
 net/ipv6/inet6_hashtables.c | 15 ++++++++++++---
 net/ipv6/udp.c              | 17 ++++++++++++++---
 net/rds/connection.c        | 12 +++++++++---
 8 files changed, 44 insertions(+), 46 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 7a6c7f80a8fd..1833c3f389ee 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -204,10 +204,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 
 int inet_sk_rebuild_header(struct sock *sk);
 
-extern u32 inet_ehash_secret;
-extern u32 ipv6_hash_secret;
-void build_ehash_secret(void);
-
 static inline unsigned int __inet_ehashfn(const __be32 laddr,
 					  const __u16 lport,
 					  const __be32 faddr,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4049906010f7..9433a6186f54 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,29 +245,6 @@ out:
 }
 EXPORT_SYMBOL(inet_listen);
 
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-u32 ipv6_hash_secret __read_mostly;
-EXPORT_SYMBOL(ipv6_hash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once, and to a non nul value
- * ipv6_hash_secret must be set exactly once.
- */
-void build_ehash_secret(void)
-{
-	u32 rnd;
-
-	do {
-		get_random_bytes(&rnd, sizeof(rnd));
-	} while (rnd == 0);
-
-	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
-		get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
 /*
  *	Create an inet socket.
  */
@@ -284,10 +261,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
-	if (unlikely(!inet_ehash_secret))
-		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
-			build_ehash_secret();
-
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 18aa668d0cc9..8b9cf279450d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -28,6 +28,10 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
 				 const __u16 lport, const __be32 faddr,
 				 const __be16 fport)
 {
+	static u32 inet_ehash_secret __read_mostly;
+
+	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
 	return __inet_ehashfn(laddr, lport, faddr, fport,
 			      inet_ehash_secret + net_hash_mix(net));
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b4437c7db6ce..89909dd730dd 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -411,8 +411,12 @@ static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
 				 const __u16 lport, const __be32 faddr,
 				 const __be16 fport)
 {
+	static u32 udp_ehash_secret __read_mostly;
+
+	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
 	return __inet_ehashfn(laddr, lport, faddr, fport,
-			      inet_ehash_secret + net_hash_mix(net));
+			      udp_ehash_secret + net_hash_mix(net));
 }
 
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a2cb07cd3850..20af1fb81c83 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -110,11 +110,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
-	if (sock->type != SOCK_RAW &&
-	    sock->type != SOCK_DGRAM &&
-	    !inet_ehash_secret)
-		build_ehash_secret();
-
 	/* Look for the requested type/protocol pair. */
 lookup_protocol:
 	err = -ESOCKTNOSUPPORT;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index fa7dd3856c55..262e13c02ec2 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -29,10 +29,19 @@ static unsigned int inet6_ehashfn(struct net *net,
 				  const struct in6_addr *faddr,
 				  const __be16 fport)
 {
-	const u32 lhash = (__force u32)laddr->s6_addr32[3];
-	const u32 fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+	static u32 inet6_ehash_secret __read_mostly;
+	static u32 ipv6_hash_secret __read_mostly;
+
+	u32 lhash, fhash;
+
+	net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
+	net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+
+	lhash = (__force u32)laddr->s6_addr32[3];
+	fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+
 	return __inet6_ehashfn(lhash, lport, fhash, fport,
-			       inet_ehash_secret + net_hash_mix(net));
+			       inet6_ehash_secret + net_hash_mix(net));
 }
 
 static int inet6_sk_ehashfn(const struct sock *sk)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 324bd36c23bc..44fc4e3d661f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -59,10 +59,21 @@ static unsigned int udp6_ehashfn(struct net *net,
 				  const struct in6_addr *faddr,
 				  const __be16 fport)
 {
-	const u32 lhash = (__force u32)laddr->s6_addr32[3];
-	const u32 fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+	static u32 udp6_ehash_secret __read_mostly;
+	static u32 udp_ipv6_hash_secret __read_mostly;
+
+	u32 lhash, fhash;
+
+	net_get_random_once(&udp6_ehash_secret,
+			    sizeof(udp6_ehash_secret));
+	net_get_random_once(&udp_ipv6_hash_secret,
+			    sizeof(udp_ipv6_hash_secret));
+
+	lhash = (__force u32)laddr->s6_addr32[3];
+	fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
+
 	return __inet6_ehashfn(lhash, lport, fhash, fport,
-			       inet_ehash_secret + net_hash_mix(net));
+			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 45e23660437a..378c3a6acf84 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -51,10 +51,16 @@ static struct kmem_cache *rds_conn_slab;
 
 static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 {
+	static u32 rds_hash_secret __read_mostly;
+
+	unsigned long hash;
+
+	net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+
 	/* Pass NULL, don't need struct net for hash */
-	unsigned long hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
-					    be32_to_cpu(faddr), 0,
-					    inet_ehash_secret);
+	hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
+			      be32_to_cpu(faddr), 0,
+			      rds_hash_secret);
 	return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
 }
 
-- 
cgit v1.2.3


From 222e83d2e0aecb6a5e8d42b1a8d51332a1eba960 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 19 Oct 2013 21:48:58 +0200
Subject: tcp: switch tcp_fastopen key generation to net_get_random_once

Changed key initialization of tcp_fastopen cookies to net_get_random_once.

If the user sets a custom key net_get_random_once must be called at
least once to ensure we don't overwrite the user provided key when the
first cookie is generated later on.

Cc: Yuchung Cheng <ycheng@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h          |  2 +-
 net/ipv4/sysctl_net_ipv4.c |  5 +++++
 net/ipv4/tcp_fastopen.c    | 27 ++++++++++++++++-----------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f30326f1c92b..b12e29a76590 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1322,7 +1322,7 @@ extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 int tcp_fastopen_reset_cipher(void *key, unsigned int len);
 void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
 			     struct tcp_fastopen_cookie *foc);
-
+void tcp_fastopen_init_key_once(bool publish);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c08f096d46b5..4b161d5aba0b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -274,6 +274,11 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
+		/* Generate a dummy secret but don't publish it. This
+		 * is needed so we don't regenerate a new key on the
+		 * first invocation of tcp_fastopen_cookie_gen
+		 */
+		tcp_fastopen_init_key_once(false);
 		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
 	}
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ab7bd35bb312..766032b4a6c3 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -14,6 +14,20 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 
 static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
 
+void tcp_fastopen_init_key_once(bool publish)
+{
+	static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+	/* tcp_fastopen_reset_cipher publishes the new context
+	 * atomically, so we allow this race happening here.
+	 *
+	 * All call sites of tcp_fastopen_cookie_gen also check
+	 * for a valid cookie, so this is an acceptable risk.
+	 */
+	if (net_get_random_once(key, sizeof(key)) && publish)
+		tcp_fastopen_reset_cipher(key, sizeof(key));
+}
+
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
 {
 	struct tcp_fastopen_context *ctx =
@@ -70,6 +84,8 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
 	__be32 path[4] = { src, dst, 0, 0 };
 	struct tcp_fastopen_context *ctx;
 
+	tcp_fastopen_init_key_once(true);
+
 	rcu_read_lock();
 	ctx = rcu_dereference(tcp_fastopen_ctx);
 	if (ctx) {
@@ -78,14 +94,3 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
 	}
 	rcu_read_unlock();
 }
-
-static int __init tcp_fastopen_init(void)
-{
-	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
-
-	get_random_bytes(key, sizeof(key));
-	tcp_fastopen_reset_cipher(key, sizeof(key));
-	return 0;
-}
-
-late_initcall(tcp_fastopen_init);
-- 
cgit v1.2.3


From cd91cce620907eb3c5b3e8b4d62aadf0a19baba9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:24:02 -0700
Subject: tcp_memcontrol: Remove tcp_max_memory

This function is never called. Remove it.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_memcontrol.h |  1 -
 net/ipv4/tcp_memcontrol.c    | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 7df18bc43a97..88cdd1cb992e 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -14,6 +14,5 @@ struct tcp_memcontrol {
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
-unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
 #endif /* _TCP_MEMCG_H */
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 559d4ae6ebf4..82985d1dc9af 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -226,19 +226,6 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
 	return 0;
 }
 
-unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
-{
-	struct tcp_memcontrol *tcp;
-	struct cg_proto *cg_proto;
-
-	cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
-	if (!cg_proto)
-		return 0;
-
-	tcp = tcp_from_cgproto(cg_proto);
-	return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-}
-
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
 {
 	struct tcp_memcontrol *tcp;
-- 
cgit v1.2.3


From f594d63199688ad568fb69f6a790b11d6d6d1ba5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:24:52 -0700
Subject: tcp_memcontrol: Remove setting cgroup settings via sysctl

The code is broken and does not constrain sysctl_tcp_mem as
tcp_update_limit does.  With the result that it allows the cgroup tcp
memory limits to be bypassed.

The semantics are broken as the settings are not per netns and are in a
per netns table, and instead looks at current.

Since the code is broken in both design and implementation and does not
implement the functionality for which it was written remove it.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp_memcontrol.h |  1 -
 net/ipv4/sysctl_net_ipv4.c   | 39 ++-------------------------------------
 net/ipv4/tcp_memcontrol.c    | 14 --------------
 3 files changed, 2 insertions(+), 52 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 88cdd1cb992e..af0c0680a873 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -14,5 +14,4 @@ struct tcp_memcontrol {
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
-void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
 #endif /* _TCP_MEMCG_H */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b161d5aba0b..8457f7bc4d89 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -204,43 +204,8 @@ static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
 			   void __user *buffer, size_t *lenp,
 			   loff_t *ppos)
 {
-	int ret;
-	unsigned long vec[3];
-	struct net *net = current->nsproxy->net_ns;
-#ifdef CONFIG_MEMCG_KMEM
-	struct mem_cgroup *memcg;
-#endif
-
-	struct ctl_table tmp = {
-		.data = &vec,
-		.maxlen = sizeof(vec),
-		.mode = ctl->mode,
-	};
-
-	if (!write) {
-		ctl->data = &net->ipv4.sysctl_tcp_mem;
-		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
-	}
-
-	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
-	if (ret)
-		return ret;
-
-#ifdef CONFIG_MEMCG_KMEM
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(current);
-
-	tcp_prot_mem(memcg, vec[0], 0);
-	tcp_prot_mem(memcg, vec[1], 1);
-	tcp_prot_mem(memcg, vec[2], 2);
-	rcu_read_unlock();
-#endif
-
-	net->ipv4.sysctl_tcp_mem[0] = vec[0];
-	net->ipv4.sysctl_tcp_mem[1] = vec[1];
-	net->ipv4.sysctl_tcp_mem[2] = vec[2];
-
-	return 0;
+	ctl->data = &current->nsproxy->net_ns->ipv4.sysctl_tcp_mem;
+	return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
 }
 
 static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 82985d1dc9af..e7c01fcf5716 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -226,20 +226,6 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
 	return 0;
 }
 
-void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
-{
-	struct tcp_memcontrol *tcp;
-	struct cg_proto *cg_proto;
-
-	cg_proto = tcp_prot.proto_cgroup(memcg);
-	if (!cg_proto)
-		return;
-
-	tcp = tcp_from_cgproto(cg_proto);
-
-	tcp->tcp_prot_mem[idx] = val;
-}
-
 static struct cftype tcp_files[] = {
 	{
 		.name = "kmem.tcp.limit_in_bytes",
-- 
cgit v1.2.3


From a4fe34bf902b8f709c635ab37f1f39de0b86cff2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:25:36 -0700
Subject: tcp_memcontrol: Remove the per netns control.

The code that is implemented is per memory cgroup not per netns, and
having per netns bits is just confusing.  Remove the per netns bits to
make it easier to see what is really going on.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 -
 include/net/tcp.h          |  3 +--
 net/ipv4/af_inet.c         |  2 --
 net/ipv4/sysctl_net_ipv4.c | 23 +++++++----------------
 net/ipv4/tcp.c             | 12 +++++++-----
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_memcontrol.c  | 10 ++++------
 net/ipv6/af_inet6.c        |  2 --
 net/ipv6/tcp_ipv6.c        |  1 +
 9 files changed, 21 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 5dbd232e12ff..ee520cba2ec2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -71,7 +71,6 @@ struct netns_ipv4 {
 	int sysctl_tcp_ecn;
 
 	kgid_t sysctl_ping_group_range[2];
-	long sysctl_tcp_mem[3];
 
 	atomic_t dev_addr_genid;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b12e29a76590..2d7b4bdc972f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -259,6 +259,7 @@ extern int sysctl_tcp_max_orphans;
 extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_dsack;
+extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
@@ -348,8 +349,6 @@ extern struct proto tcp_prot;
 #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
 #define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
 
-void tcp_init_mem(struct net *net);
-
 void tcp_tasklet_init(void);
 
 void tcp_v4_err(struct sk_buff *skb, u32);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9433a6186f54..24a53fc275b0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1697,8 +1697,6 @@ static int __init inet_init(void)
 	ip_static_sysctl_init();
 #endif
 
-	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
 	/*
 	 *	Add all the base protocols.
 	 */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8457f7bc4d89..69c6a8dbe09d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -200,14 +200,6 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
 	return ret;
 }
 
-static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
-			   void __user *buffer, size_t *lenp,
-			   loff_t *ppos)
-{
-	ctl->data = &current->nsproxy->net_ns->ipv4.sysctl_tcp_mem;
-	return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
-}
-
 static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos)
@@ -521,6 +513,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_mem",
+		.maxlen		= sizeof(sysctl_tcp_mem),
+		.data		= &sysctl_tcp_mem,
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 	{
 		.procname	= "tcp_wmem",
 		.data		= &sysctl_tcp_wmem,
@@ -830,12 +829,6 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
-	{
-		.procname	= "tcp_mem",
-		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
-		.mode		= 0644,
-		.proc_handler	= ipv4_tcp_mem,
-	},
 	{ }
 };
 
@@ -887,8 +880,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	net->ipv4.sysctl_local_ports.range[0] =  32768;
 	net->ipv4.sysctl_local_ports.range[1] =  61000;
 
-	tcp_init_mem(net);
-
 	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
 	if (net->ipv4.ipv4_hdr == NULL)
 		goto err_reg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index be4b161802e8..8e8529d3c8c9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -288,9 +288,11 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
+long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
+EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
@@ -3097,13 +3099,13 @@ static int __init set_thash_entries(char *str)
 }
 __setup("thash_entries=", set_thash_entries);
 
-void tcp_init_mem(struct net *net)
+static void tcp_init_mem(void)
 {
 	unsigned long limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
-	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
-	net->ipv4.sysctl_tcp_mem[1] = limit;
-	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+	sysctl_tcp_mem[0] = limit / 4 * 3;
+	sysctl_tcp_mem[1] = limit;
+	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
 }
 
 void __init tcp_init(void)
@@ -3165,7 +3167,7 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	tcp_init_mem(&init_net);
+	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 114d1b748cbb..300ab2c93f29 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2749,6 +2749,7 @@ struct proto tcp_prot = {
 	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
+	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index e7c01fcf5716..86feaa0d6d70 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -29,7 +29,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	struct cg_proto *cg_proto, *parent_cg;
 	struct tcp_memcontrol *tcp;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	struct net *net = current->nsproxy->net_ns;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
@@ -37,9 +36,9 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 	tcp = tcp_from_cgproto(cg_proto);
 
-	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
-	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
-	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+	tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0];
+	tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1];
+	tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2];
 	tcp->tcp_memory_pressure = 0;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
@@ -76,7 +75,6 @@ EXPORT_SYMBOL(tcp_destroy_cgroup);
 
 static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 {
-	struct net *net = current->nsproxy->net_ns;
 	struct tcp_memcontrol *tcp;
 	struct cg_proto *cg_proto;
 	u64 old_lim;
@@ -99,7 +97,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 
 	for (i = 0; i < 3; i++)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
-					     net->ipv4.sysctl_tcp_mem[i]);
+					     sysctl_tcp_mem[i]);
 
 	if (val == RES_COUNTER_MAX)
 		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 20af1fb81c83..6468bda1f2b9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -865,8 +865,6 @@ static int __init inet6_init(void)
 	if (err)
 		goto out_sock_register_fail;
 
-	tcpv6_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
 	/*
 	 *	ipngwg API draft makes clear that the correct semantics
 	 *	for TCP and UDP is to consider one TCP and UDP instance
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b996ee2005a9..0740f93a114a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1929,6 +1929,7 @@ struct proto tcpv6_prot = {
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
 	.orphan_count		= &tcp_orphan_count,
+	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
-- 
cgit v1.2.3


From 2e685cad57906e19add7189b5ff49dfb6aaa21d3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:26:19 -0700
Subject: tcp_memcontrol: Kill struct tcp_memcontrol

Replace the pointers in struct cg_proto with actual data fields and kill
struct tcp_memcontrol as it is not fully redundant.

This removes a confusing, unnecessary layer of abstraction.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h           | 28 ++++++++++----------
 include/net/tcp_memcontrol.h | 10 --------
 mm/memcontrol.c              |  6 ++---
 net/ipv4/tcp_memcontrol.c    | 61 +++++++++++++-------------------------------
 4 files changed, 35 insertions(+), 70 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index 7e50df5c71d4..86bb0668c581 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1036,10 +1036,10 @@ enum cg_proto_flags {
 
 struct cg_proto {
 	void			(*enter_memory_pressure)(struct sock *sk);
-	struct res_counter	*memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
-	int			*memory_pressure;
-	long			*sysctl_mem;
+	struct res_counter	memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
+	int			memory_pressure;
+	long			sysctl_mem[3];
 	unsigned long		flags;
 	/*
 	 * memcg field is used to find which memcg we belong directly
@@ -1135,9 +1135,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 		return false;
 
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return !!*sk->sk_cgrp->memory_pressure;
+		return !!sk->sk_cgrp->memory_pressure;
 
-	return !!*sk->sk_prot->memory_pressure;
+	return !!sk->sk_prot->memory_pressure;
 }
 
 static inline void sk_leave_memory_pressure(struct sock *sk)
@@ -1155,8 +1155,8 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
 		struct proto *prot = sk->sk_prot;
 
 		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			if (*cg_proto->memory_pressure)
-				*cg_proto->memory_pressure = 0;
+			if (cg_proto->memory_pressure)
+				cg_proto->memory_pressure = 0;
 	}
 
 }
@@ -1192,7 +1192,7 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 	struct res_counter *fail;
 	int ret;
 
-	ret = res_counter_charge_nofail(prot->memory_allocated,
+	ret = res_counter_charge_nofail(&prot->memory_allocated,
 					amt << PAGE_SHIFT, &fail);
 	if (ret < 0)
 		*parent_status = OVER_LIMIT;
@@ -1201,13 +1201,13 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
 					      unsigned long amt)
 {
-	res_counter_uncharge(prot->memory_allocated, amt << PAGE_SHIFT);
+	res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT);
 }
 
 static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
 {
 	u64 ret;
-	ret = res_counter_read_u64(prot->memory_allocated, RES_USAGE);
+	ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE);
 	return ret >> PAGE_SHIFT;
 }
 
@@ -1255,7 +1255,7 @@ static inline void sk_sockets_allocated_dec(struct sock *sk)
 		struct cg_proto *cg_proto = sk->sk_cgrp;
 
 		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			percpu_counter_dec(cg_proto->sockets_allocated);
+			percpu_counter_dec(&cg_proto->sockets_allocated);
 	}
 
 	percpu_counter_dec(prot->sockets_allocated);
@@ -1269,7 +1269,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk)
 		struct cg_proto *cg_proto = sk->sk_cgrp;
 
 		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			percpu_counter_inc(cg_proto->sockets_allocated);
+			percpu_counter_inc(&cg_proto->sockets_allocated);
 	}
 
 	percpu_counter_inc(prot->sockets_allocated);
@@ -1281,7 +1281,7 @@ sk_sockets_allocated_read_positive(struct sock *sk)
 	struct proto *prot = sk->sk_prot;
 
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return percpu_counter_read_positive(sk->sk_cgrp->sockets_allocated);
+		return percpu_counter_read_positive(&sk->sk_cgrp->sockets_allocated);
 
 	return percpu_counter_read_positive(prot->sockets_allocated);
 }
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index af0c0680a873..05b94d9453de 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -1,16 +1,6 @@
 #ifndef _TCP_MEMCG_H
 #define _TCP_MEMCG_H
 
-struct tcp_memcontrol {
-	struct cg_proto cg_proto;
-	/* per-cgroup tcp memory pressure knobs */
-	struct res_counter tcp_memory_allocated;
-	struct percpu_counter tcp_sockets_allocated;
-	/* those two are read-mostly, leave them at the end */
-	long tcp_prot_mem[3];
-	int tcp_memory_pressure;
-};
-
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddbc839b..28243f7d9c23 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -311,7 +311,7 @@ struct mem_cgroup {
 
 	atomic_t	dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-	struct tcp_memcontrol tcp_mem;
+	struct cg_proto tcp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
 	/* analogous to slab_common's slab_caches list. per-memcg */
@@ -550,13 +550,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 	if (!memcg || mem_cgroup_is_root(memcg))
 		return NULL;
 
-	return &memcg->tcp_mem.cg_proto;
+	return &memcg->tcp_mem;
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
-	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+	if (!memcg_proto_activated(&memcg->tcp_mem))
 		return;
 	static_key_slow_dec(&memcg_socket_limit_enabled);
 }
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 86feaa0d6d70..03e9154f7e68 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,15 +6,10 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
-static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
-{
-	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
-}
-
 static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 {
 	if (sk->sk_cgrp->memory_pressure)
-		*sk->sk_cgrp->memory_pressure = 1;
+		sk->sk_cgrp->memory_pressure = 1;
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
@@ -27,33 +22,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	 */
 	struct res_counter *res_parent = NULL;
 	struct cg_proto *cg_proto, *parent_cg;
-	struct tcp_memcontrol *tcp;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
 		return 0;
 
-	tcp = tcp_from_cgproto(cg_proto);
-
-	tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0];
-	tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1];
-	tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2];
-	tcp->tcp_memory_pressure = 0;
+	cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
+	cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
+	cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
+	cg_proto->memory_pressure = 0;
+	cg_proto->memcg = memcg;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
 	if (parent_cg)
-		res_parent = parent_cg->memory_allocated;
-
-	res_counter_init(&tcp->tcp_memory_allocated, res_parent);
-	percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
+		res_parent = &parent_cg->memory_allocated;
 
-	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
-	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
-	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
-	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
-	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
-	cg_proto->memcg = memcg;
+	res_counter_init(&cg_proto->memory_allocated, res_parent);
+	percpu_counter_init(&cg_proto->sockets_allocated, 0);
 
 	return 0;
 }
@@ -62,20 +48,17 @@ EXPORT_SYMBOL(tcp_init_cgroup);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 {
 	struct cg_proto *cg_proto;
-	struct tcp_memcontrol *tcp;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
 		return;
 
-	tcp = tcp_from_cgproto(cg_proto);
-	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+	percpu_counter_destroy(&cg_proto->sockets_allocated);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
 static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 {
-	struct tcp_memcontrol *tcp;
 	struct cg_proto *cg_proto;
 	u64 old_lim;
 	int i;
@@ -88,16 +71,14 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 	if (val > RES_COUNTER_MAX)
 		val = RES_COUNTER_MAX;
 
-	tcp = tcp_from_cgproto(cg_proto);
-
-	old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-	ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
+	old_lim = res_counter_read_u64(&cg_proto->memory_allocated, RES_LIMIT);
+	ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
 	if (ret)
 		return ret;
 
 	for (i = 0; i < 3; i++)
-		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
-					     sysctl_tcp_mem[i]);
+		cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
+						sysctl_tcp_mem[i]);
 
 	if (val == RES_COUNTER_MAX)
 		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
@@ -154,28 +135,24 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
 
 static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
 {
-	struct tcp_memcontrol *tcp;
 	struct cg_proto *cg_proto;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
 		return default_val;
 
-	tcp = tcp_from_cgproto(cg_proto);
-	return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
+	return res_counter_read_u64(&cg_proto->memory_allocated, type);
 }
 
 static u64 tcp_read_usage(struct mem_cgroup *memcg)
 {
-	struct tcp_memcontrol *tcp;
 	struct cg_proto *cg_proto;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
 		return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
 
-	tcp = tcp_from_cgproto(cg_proto);
-	return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+	return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
 }
 
 static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -203,21 +180,19 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
 static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
 {
 	struct mem_cgroup *memcg;
-	struct tcp_memcontrol *tcp;
 	struct cg_proto *cg_proto;
 
 	memcg = mem_cgroup_from_css(css);
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
 		return 0;
-	tcp = tcp_from_cgproto(cg_proto);
 
 	switch (event) {
 	case RES_MAX_USAGE:
-		res_counter_reset_max(&tcp->tcp_memory_allocated);
+		res_counter_reset_max(&cg_proto->memory_allocated);
 		break;
 	case RES_FAILCNT:
-		res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
+		res_counter_reset_failcnt(&cg_proto->memory_allocated);
 		break;
 	}
 
-- 
cgit v1.2.3


From 0a6fa23dcb10eeb21adfd9955f7030f952a8122d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:27:03 -0700
Subject: ipv4: Use math to point per net sysctls into the appropriate struct
 net.

Simplify maintenance of ipv4_net_table by using math to point the per
net sysctls into the appropriate struct net, instead of manually
reassinging all of the variables into hard coded table slots.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69c6a8dbe09d..3a05e8123235 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -838,28 +838,15 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	table = ipv4_net_table;
 	if (!net_eq(net, &init_net)) {
+		int i;
+
 		table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
 		if (table == NULL)
 			goto err_alloc;
 
-		table[0].data =
-			&net->ipv4.sysctl_icmp_echo_ignore_all;
-		table[1].data =
-			&net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
-		table[2].data =
-			&net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
-		table[3].data =
-			&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
-		table[4].data =
-			&net->ipv4.sysctl_icmp_ratelimit;
-		table[5].data =
-			&net->ipv4.sysctl_icmp_ratemask;
-		table[6].data =
-			&net->ipv4.sysctl_ping_group_range;
-		table[7].data =
-			&net->ipv4.sysctl_tcp_ecn;
-		table[8].data =
-			&net->ipv4.sysctl_local_ports.range;
+		/* Update the variables to point into the current struct net */
+		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
+			table[i].data += (void *)net - (void *)&init_net;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
-- 
cgit v1.2.3


From fd2d5356d90211f98ea3624263e37c4142b41edd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 19 Oct 2013 16:27:35 -0700
Subject: ipv4: Allow unprivileged users to use per net sysctls

Allow unprivileged users to use:
/proc/sys/net/ipv4/icmp_echo_ignore_all
/proc/sys/net/ipv4/icmp_echo_ignore_broadcasts
/proc/sys/net/ipv4/icmp_ignore_bogus_error_response
/proc/sys/net/ipv4/icmp_errors_use_inbound_ifaddr
/proc/sys/net/ipv4/icmp_ratelimit
/proc/sys/net/ipv4/icmp_ratemask
/proc/sys/net/ipv4/ping_group_range
/proc/sys/net/ipv4/tcp_ecn
/proc/sys/net/ipv4/ip_local_ports_range

These are occassionally handy and after a quick review I don't see
any problems with unprivileged users using them.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a05e8123235..d5b1390eebbe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -847,10 +847,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 		/* Update the variables to point into the current struct net */
 		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
 			table[i].data += (void *)net - (void *)&init_net;
-
-		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
-			table[0].procname = NULL;
 	}
 
 	/*
-- 
cgit v1.2.3


From 61c1db7fae21ed33c614356a43bf6580c5e53118 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 20 Oct 2013 20:47:30 -0700
Subject: ipv6: sit: add GSO/TSO support

Now ipv6_gso_segment() is stackable, its relatively easy to
implement GSO/TSO support for SIT tunnels

Performance results, when segmentation is done after tunnel
device (as no NIC is yet enabled for TSO SIT support) :

Before patch :

lpq84:~# ./netperf -H 2002:af6:1153:: -Cc
MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      3168.31   4.81     4.64     2.988   2.877

After patch :

lpq84:~# ./netperf -H 2002:af6:1153:: -Cc
MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384  16384    10.00      5525.00   7.76     5.17     2.763   1.840

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  2 ++
 include/linux/skbuff.h          |  6 ++++--
 net/core/ethtool.c              |  1 +
 net/ipv4/af_inet.c              |  1 +
 net/ipv4/tcp_offload.c          |  1 +
 net/ipv6/ip6_offload.c          | 11 +++++++++++
 net/ipv6/sit.c                  | 28 +++++++++++++++++++---------
 net/ipv6/udp_offload.c          |  1 +
 8 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 8dad68cede1c..b05a4b501ab5 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -43,6 +43,7 @@ enum {
 	NETIF_F_FSO_BIT,		/* ... FCoE segmentation */
 	NETIF_F_GSO_GRE_BIT,		/* ... GRE with TSO */
 	NETIF_F_GSO_IPIP_BIT,		/* ... IPIP tunnel with TSO */
+	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
@@ -109,6 +110,7 @@ enum {
 #define NETIF_F_RXALL		__NETIF_F(RXALL)
 #define NETIF_F_GSO_GRE		__NETIF_F(GSO_GRE)
 #define NETIF_F_GSO_IPIP	__NETIF_F(GSO_IPIP)
+#define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 60729134d253..2c154976394b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -320,9 +320,11 @@ enum {
 
 	SKB_GSO_IPIP = 1 << 7,
 
-	SKB_GSO_UDP_TUNNEL = 1 << 8,
+	SKB_GSO_SIT = 1 << 8,
 
-	SKB_GSO_MPLS = 1 << 9,
+	SKB_GSO_UDP_TUNNEL = 1 << 9,
+
+	SKB_GSO_MPLS = 1 << 10,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 8cab7744790e..862989898f61 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -82,6 +82,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
 	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
+	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation",
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 24a53fc275b0..f4a159e705c0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1265,6 +1265,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
 		       SKB_GSO_IPIP |
+		       SKB_GSO_SIT |
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_MPLS |
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index dfc96b00673e..a7a5583eab04 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -57,6 +57,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_TCPV6 |
 			       SKB_GSO_GRE |
 			       SKB_GSO_IPIP |
+			       SKB_GSO_SIT |
 			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       0) ||
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f9b33d82bb9d..4b851692b1f6 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -98,6 +98,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
 		       SKB_GSO_IPIP |
+		       SKB_GSO_SIT |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
@@ -276,6 +277,13 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
 	},
 };
 
+static const struct net_offload sit_offload = {
+	.callbacks = {
+		.gso_send_check = ipv6_gso_send_check,
+		.gso_segment	= ipv6_gso_segment,
+	},
+};
+
 static int __init ipv6_offload_init(void)
 {
 
@@ -287,6 +295,9 @@ static int __init ipv6_offload_init(void)
 		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
 
 	dev_add_offload(&ipv6_packet_offload);
+
+	inet_add_offload(&sit_offload, IPPROTO_IPV6);
+
 	return 0;
 }
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 19269453a8ea..3a9038dd818d 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -933,10 +933,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		ttl = iph6->hop_limit;
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
-	if (likely(!skb->encapsulation)) {
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-	}
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT);
+	if (IS_ERR(skb))
+		goto out;
 
 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos,
 			    ttl, df, !net_eq(tunnel->net, dev_net(dev)));
@@ -946,8 +945,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 tx_error_icmp:
 	dst_link_failure(skb);
 tx_error:
-	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
+out:
+	dev->stats.tx_errors++;
 	return NETDEV_TX_OK;
 }
 
@@ -956,13 +956,15 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 
-	if (likely(!skb->encapsulation)) {
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-	}
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto out;
 
 	ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
 	return NETDEV_TX_OK;
+out:
+	dev->stats.tx_errors++;
+	return NETDEV_TX_OK;
 }
 
 static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
@@ -1292,6 +1294,12 @@ static void ipip6_dev_free(struct net_device *dev)
 	free_netdev(dev);
 }
 
+#define SIT_FEATURES (NETIF_F_SG	   | \
+		      NETIF_F_FRAGLIST	   | \
+		      NETIF_F_HIGHDMA	   | \
+		      NETIF_F_GSO_SOFTWARE | \
+		      NETIF_F_HW_CSUM)
+
 static void ipip6_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipip6_netdev_ops;
@@ -1305,6 +1313,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_LLTX;
+	dev->features		|= SIT_FEATURES;
+	dev->hw_features	|= SIT_FEATURES;
 }
 
 static int ipip6_tunnel_init(struct net_device *dev)
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index f63780ff3732..08e23b0bf302 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -65,6 +65,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_GRE |
 				      SKB_GSO_IPIP |
+				      SKB_GSO_SIT |
 				      SKB_GSO_MPLS) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
-- 
cgit v1.2.3


From 02cf4ebd82ff0ac7254b88e466820a290ed8289a Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 21 Oct 2013 15:40:19 -0400
Subject: tcp: initialize passive-side sk_pacing_rate after 3WHS

For passive TCP connections, upon receiving the ACK that completes the
3WHS, make sure we set our pacing rate after we get our first RTT
sample.

On passive TCP connections, when we receive the ACK completing the
3WHS we do not take an RTT sample in tcp_ack(), but rather in
tcp_synack_rtt_meas(). So upon receiving the ACK that completes the
3WHS, tcp_ack() leaves sk_pacing_rate at its initial value.

Originally the initial sk_pacing_rate value was 0, so passive-side
connections defaulted to sysctl_tcp_min_tso_segs (2 segs) in skbuffs
made in the first RTT. With a default initial cwnd of 10 packets, this
happened to be correct for RTTs 5ms or bigger, so it was hard to
see problems in WAN or emulated WAN testing.

Since 7eec4174ff ("pkt_sched: fq: fix non TCP flows pacing"), the
initial sk_pacing_rate is 0xffffffff. So after that change, passive
TCP connections were keeping this value (and using large numbers of
segments per skbuff) until receiving an ACK for data.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53974c7c04fe..a16b01b537ba 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5712,6 +5712,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		} else
 			tcp_init_metrics(sk);
 
+		tcp_update_pacing_rate(sk);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
 		tp->lsndtime = tcp_time_stamp;
 
-- 
cgit v1.2.3


From b416c144f46af1a30ddfa4e4319a8f077381ad63 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 21 Oct 2013 13:14:53 +0100
Subject: netfilter: x_tables: fix ordering of jumpstack allocation and table
 update

During kernel stability testing on an SMP ARMv7 system, Yalin Wang
reported the following panic from the netfilter code:

  1fe0: 0000001c 5e2d3b10 4007e779 4009e110 60000010 00000032 ff565656 ff545454
  [<c06c48dc>] (ipt_do_table+0x448/0x584) from [<c0655ef0>] (nf_iterate+0x48/0x7c)
  [<c0655ef0>] (nf_iterate+0x48/0x7c) from [<c0655f7c>] (nf_hook_slow+0x58/0x104)
  [<c0655f7c>] (nf_hook_slow+0x58/0x104) from [<c0683bbc>] (ip_local_deliver+0x88/0xa8)
  [<c0683bbc>] (ip_local_deliver+0x88/0xa8) from [<c0683718>] (ip_rcv_finish+0x418/0x43c)
  [<c0683718>] (ip_rcv_finish+0x418/0x43c) from [<c062b1c4>] (__netif_receive_skb+0x4cc/0x598)
  [<c062b1c4>] (__netif_receive_skb+0x4cc/0x598) from [<c062b314>] (process_backlog+0x84/0x158)
  [<c062b314>] (process_backlog+0x84/0x158) from [<c062de84>] (net_rx_action+0x70/0x1dc)
  [<c062de84>] (net_rx_action+0x70/0x1dc) from [<c0088230>] (__do_softirq+0x11c/0x27c)
  [<c0088230>] (__do_softirq+0x11c/0x27c) from [<c008857c>] (do_softirq+0x44/0x50)
  [<c008857c>] (do_softirq+0x44/0x50) from [<c0088614>] (local_bh_enable_ip+0x8c/0xd0)
  [<c0088614>] (local_bh_enable_ip+0x8c/0xd0) from [<c06b0330>] (inet_stream_connect+0x164/0x298)
  [<c06b0330>] (inet_stream_connect+0x164/0x298) from [<c061d68c>] (sys_connect+0x88/0xc8)
  [<c061d68c>] (sys_connect+0x88/0xc8) from [<c000e340>] (ret_fast_syscall+0x0/0x30)
  Code: 2a000021 e59d2028 e59de01c e59f011c (e7824103)
  ---[ end trace da227214a82491bd ]---
  Kernel panic - not syncing: Fatal exception in interrupt

This comes about because CPU1 is executing xt_replace_table in response
to a setsockopt syscall, resulting in:

	ret = xt_jumpstack_alloc(newinfo);
		--> newinfo->jumpstack = kzalloc(size, GFP_KERNEL);

	[...]

	table->private = newinfo;
	newinfo->initial_entries = private->initial_entries;

Meanwhile, CPU0 is handling the network receive path and ends up in
ipt_do_table, resulting in:

	private = table->private;

	[...]

	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];

On weakly ordered memory architectures, the writes to table->private
and newinfo->jumpstack from CPU1 can be observed out of order by CPU0.
Furthermore, on architectures which don't respect ordering of address
dependencies (i.e. Alpha), the reads from CPU0 can also be re-ordered.

This patch adds an smp_wmb() before the assignment to table->private
(which is essentially publishing newinfo) to ensure that all writes to
newinfo will be observed before plugging it into the table structure.
A dependent-read barrier is also added on the consumer sides, to ensure
the same ordering requirements are also respected there.

Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: Wang, Yalin <Yalin.Wang@sonymobile.com>
Tested-by: Wang, Yalin <Yalin.Wang@sonymobile.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 5 +++++
 net/ipv4/netfilter/ip_tables.c  | 5 +++++
 net/ipv6/netfilter/ip6_tables.c | 5 +++++
 net/netfilter/x_tables.c        | 7 ++++++-
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85a4f21aac1a..59da7cde0724 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
 	private = table->private;
+	/*
+	 * Ensure we load private-> members after we've fetched the base
+	 * pointer.
+	 */
+	smp_read_barrier_depends();
 	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d23118d95ff9..718dfbd30cbe 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb,
 	addend = xt_write_recseq_begin();
 	private = table->private;
 	cpu        = smp_processor_id();
+	/*
+	 * Ensure we load private-> members after we've fetched the base
+	 * pointer.
+	 */
+	smp_read_barrier_depends();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 44400c216dc6..710238f58aa9 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -349,6 +349,11 @@ ip6t_do_table(struct sk_buff *skb,
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
 	private = table->private;
+	/*
+	 * Ensure we load private-> members after we've fetched the base
+	 * pointer.
+	 */
+	smp_read_barrier_depends();
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8b03028cca69..227aa11e8409 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 
-	table->private = newinfo;
 	newinfo->initial_entries = private->initial_entries;
+	/*
+	 * Ensure contents of newinfo are visible before assigning to
+	 * private.
+	 */
+	smp_wmb();
+	table->private = newinfo;
 
 	/*
 	 * Even though table entries have now been swapped, other CPU's
-- 
cgit v1.2.3


From e7b519ba55aeb675daee1d304e80d752c385f7f0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 23 Oct 2013 11:06:55 +0200
Subject: ipv4: initialize ip4_frags hash secret as late as possible

Defer the generation of the first hash secret for the ipv4 fragmentation
cache as late as possible.

ip4_frags.rnd gets initial seeded by inet_frags_init and regulary
reseeded by inet_frag_secret_rebuild. Either we call ipqhashfn directly
from ip_fragment.c in which case we initialize the secret directly.

If we first get called by inet_frag_secret_rebuild we install a new secret
by a manual call to get_random_bytes. This secret will be overwritten
as soon as the first call to ipqhashfn happens. This is safe because we
won't race while publishing the new secrets with anyone else.

Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b66910aaef4d..2481993a4970 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -106,6 +106,7 @@ struct ip4_create_arg {
 
 static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 {
+	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
 	return jhash_3words((__force u32)id << 16 | prot,
 			    (__force u32)saddr, (__force u32)daddr,
 			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
-- 
cgit v1.2.3


From 7088ad74e6e710d0c80ea2cead9500f47a2a5d58 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 23 Oct 2013 11:06:57 +0200
Subject: inet: remove old fragmentation hash initializing

All fragmentation hash secrets now get initialized by their
corresponding hash function with net_get_random_once. Thus we can
eliminate the initial seeding.

Also provide a comment that hash secret seeding happens at the first
call to the corresponding hashing function.

Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 4 ++++
 net/ipv4/inet_fragment.c | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index bfcbc0017950..6f59de98dabd 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -64,6 +64,10 @@ struct inet_frags {
 	rwlock_t		lock ____cacheline_aligned_in_smp;
 	int			secret_interval;
 	struct timer_list	secret_timer;
+
+	/* The first call to hashfn is responsible to initialize
+	 * rnd. This is best done with net_get_random_once.
+	 */
 	u32			rnd;
 	int			qsize;
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c5313a9c019b..bb075fc9a14f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -93,9 +93,6 @@ void inet_frags_init(struct inet_frags *f)
 	}
 	rwlock_init(&f->lock);
 
-	f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^
-				   (jiffies ^ (jiffies >> 6)));
-
 	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
 			(unsigned long)f);
 	f->secret_timer.expires = jiffies + f->secret_interval;
-- 
cgit v1.2.3


From 27bf69708394ca270f9ec53875aa7309c86dfc1d Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 16 Oct 2013 21:07:39 +0530
Subject: net: use DMA_COMPLETE for dma completion status

Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6e5617b9f9db..d2652fb3232e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1429,7 +1429,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 	do {
 		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
 					      last_issued, &done,
-					      &used) == DMA_SUCCESS) {
+					      &used) == DMA_COMPLETE) {
 			/* Safe to free early-copied skbs now */
 			__skb_queue_purge(&sk->sk_async_wait_queue);
 			break;
@@ -1437,7 +1437,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 			struct sk_buff *skb;
 			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
 			       (dma_async_is_complete(skb->dma_cookie, done,
-						      used) == DMA_SUCCESS)) {
+						      used) == DMA_COMPLETE)) {
 				__skb_dequeue(&sk->sk_async_wait_queue);
 				kfree_skb(skb);
 			}
-- 
cgit v1.2.3


From bc15afa39ecc16f01c3389d15d8f6015a427fe85 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 24 Oct 2013 08:44:25 -0700
Subject: tcp: fix SYNACK RTT estimation in Fast Open

tp->lsndtime may not always be the SYNACK timestamp if a passive
Fast Open socket sends data before handshake completes. And if the
remote acknowledges both the data and the SYNACK, the RTT sample
is already taken in tcp_ack(), so no need to call
tcp_update_ack_rtt() in tcp_synack_rtt_meas() aagain.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a16b01b537ba..305cd0526a78 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2871,14 +2871,19 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	s32 seq_rtt = -1;
 
-	if (tp->lsndtime && !tp->total_retrans)
-		seq_rtt = tcp_time_stamp - tp->lsndtime;
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+	if (synack_stamp && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - synack_stamp;
+
+	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+	 */
+	if (!tp->srtt)
+		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -5587,6 +5592,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	struct request_sock *req;
 	int queued = 0;
 	bool acceptable;
+	u32 synack_stamp;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5669,9 +5675,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
+			synack_stamp = tcp_rsk(req)->snt_synack;
 			tp->total_retrans = req->num_retrans;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
+			synack_stamp = tp->lsndtime;
 			/* Make sure socket is routed, for correct metrics. */
 			icsk->icsk_af_ops->rebuild_header(sk);
 			tcp_init_congestion_control(sk);
@@ -5694,7 +5702,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_synack_rtt_meas(sk, req);
+		tcp_synack_rtt_meas(sk, synack_stamp);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-- 
cgit v1.2.3


From 2909d874f34eae157aecab0af27c6dc4a1751f8f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 24 Oct 2013 08:55:25 -0700
Subject: tcp: only take RTT from timestamps if new data is acked

Patch ed08495c3 "tcp: use RTT from SACK for RTO" has a bug that
it does not check if the ACK acknowledge new data before taking
the RTT sample from TCP timestamps. This patch adds the check
back as required by the RFC.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 305cd0526a78..6ffe41a82c00 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2856,7 +2856,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * left edge of the send window.
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 */
-	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	    flag & FLAG_ACKED)
 		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
 	if (seq_rtt < 0)
-- 
cgit v1.2.3


From 2f715c1dde6e1760f3101358dc26f8c9489be0bf Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 24 Oct 2013 08:59:27 -0700
Subject: tcp: do not rearm RTO when future data are sacked

Patch ed08495c3 "tcp: use RTT from SACK for RTO" always re-arms RTO upon
obtaining a RTT sample from newly sacked data.

But technically RTO should only be re-armed when the data sent before
the last (re)transmission of write queue head are (s)acked. Otherwise
the RTO may continue to extend during loss recovery on data sent
in the future.

Note that RTTs from ACK or timestamps do not have this problem, as the RTT
source must be from data sent before.

The new RTO re-arm policy is
1) Always re-arm RTO if SND.UNA is advanced
2) Re-arm RTO if sack RTT is available, provided the sacked data was
   sent before the last time write_queue_head was sent.

Signed-off-by: Larry Brakmo <brakmo@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6ffe41a82c00..068c8fb0d158 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2987,6 +2987,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	s32 seq_rtt = -1;
 	s32 ca_seq_rtt = -1;
 	ktime_t last_ackt = net_invalid_timestamp();
+	bool rtt_update;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3063,14 +3064,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
-	    (flag & FLAG_ACKED))
-		tcp_rearm_rto(sk);
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
 
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
 
+		tcp_rearm_rto(sk);
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3109,6 +3109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
 		}
+	} else if (skb && rtt_update && sack_rtt >= 0 &&
+		   sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
+		/* Do not re-arm RTO if the sack RTT is measured from data sent
+		 * after when the head was last (re)transmitted. Otherwise the
+		 * timeout may continue to extend in loss recovery.
+		 */
+		tcp_rearm_rto(sk);
 	}
 
 #if FASTRETRANS_DEBUG > 0
-- 
cgit v1.2.3


From 8c3a897bfab10f68f90252440bb29e6749a7312a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 27 Oct 2013 18:18:16 -0700
Subject: inet: restore gso for vxlan

Alexei reported a performance regression on vxlan, caused
by commit 3347c9602955 "ipv4: gso: make inet_gso_segment() stackable"

GSO vxlan packets were not properly segmented, adding IP fragments
while they were not expected.

Rename 'bool tunnel' to 'bool encap', and add a new boolean
to express the fact that UDP should be fragmented.
This fragmentation is triggered by skb->encapsulation being set.

Remove a "skb->encapsulation = 1" added in above commit,
as its not needed, as frags inherit skb->frag from original
GSO skb.

Reported-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f4a159e705c0..09d78d4a3cff 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1251,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
+	bool udpfrag, encap;
 	struct iphdr *iph;
-	bool tunnel;
 	int proto;
 	int nhoff;
 	int ihl;
@@ -1290,8 +1290,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		goto out;
 	__skb_pull(skb, ihl);
 
-	tunnel = SKB_GSO_CB(skb)->encap_level > 0;
-	if (tunnel)
+	encap = SKB_GSO_CB(skb)->encap_level > 0;
+	if (encap)
 		features = skb->dev->hw_enc_features & netif_skb_features(skb);
 	SKB_GSO_CB(skb)->encap_level += ihl;
 
@@ -1306,24 +1306,23 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
+	udpfrag = !!skb->encapsulation && proto == IPPROTO_UDP;
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
-		if (!tunnel && proto == IPPROTO_UDP) {
+		if (udpfrag) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next != NULL)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else  {
+		} else {
 			iph->id = htons(id++);
 		}
 		iph->tot_len = htons(skb->len - nhoff);
 		ip_send_check(iph);
-		if (tunnel) {
+		if (encap)
 			skb_reset_inner_headers(skb);
-			skb->encapsulation = 1;
-		}
 		skb->network_header = (u8 *)iph - skb->head;
 	} while ((skb = skb->next));
 
-- 
cgit v1.2.3


From eeb1b73378b560e00ff1da2ef09fed9254f4e128 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 25 Oct 2013 10:21:32 +0200
Subject: xfrm: Increase the garbage collector threshold

With the removal of the routing cache, we lost the
option to tweak the garbage collector threshold
along with the maximum routing cache size. So git
commit 703fb94ec ("xfrm: Fix the gc threshold value
for ipv4") moved back to a static threshold.

It turned out that the current threshold before we
start garbage collecting is much to small for some
workloads, so increase it from 1024 to 32768. This
means that we start the garbage collector if we have
more than 32768 dst entries in the system and refuse
new allocations if we are above 65536.

Reported-by: Wolfgang Walter <linux@stwm.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 2 +-
 net/ipv6/xfrm6_policy.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index ccde54248c8c..4764ee48447c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -236,7 +236,7 @@ static struct dst_ops xfrm4_dst_ops = {
 	.destroy =		xfrm4_dst_destroy,
 	.ifdown =		xfrm4_dst_ifdown,
 	.local_out =		__ip_local_out,
-	.gc_thresh =		1024,
+	.gc_thresh =		32768,
 };
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 08ed2772b7aa..dd503a3535ff 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = {
 	.destroy =		xfrm6_dst_destroy,
 	.ifdown =		xfrm6_dst_ifdown,
 	.local_out =		__ip6_local_out,
-	.gc_thresh =		1024,
+	.gc_thresh =		32768,
 };
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
-- 
cgit v1.2.3


From 0d08c42cf9a71530fef5ebcfe368f38f2dd0476f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 25 Oct 2013 17:26:17 -0700
Subject: tcp: gso: fix truesize tracking

commit 6ff50cd55545 ("tcp: gso: do not generate out of order packets")
had an heuristic that can trigger a warning in skb_try_coalesce(),
because skb->truesize of the gso segments were exactly set to mss.

This breaks the requirement that

skb->truesize >= skb->len + truesizeof(struct sk_buff);

It can trivially be reproduced by :

ifconfig lo mtu 1500
ethtool -K lo tso off
netperf

As the skbs are looped into the TCP networking stack, skb_try_coalesce()
warns us of these skb under-estimating their truesize.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_offload.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 3a7525e6c086..533c58a5cfb7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -18,6 +18,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
 				netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int sum_truesize = 0;
 	struct tcphdr *th;
 	unsigned int thlen;
 	unsigned int seq;
@@ -102,13 +103,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
 		if (copy_destructor) {
 			skb->destructor = gso_skb->destructor;
 			skb->sk = gso_skb->sk;
-			/* {tcp|sock}_wfree() use exact truesize accounting :
-			 * sum(skb->truesize) MUST be exactly be gso_skb->truesize
-			 * So we account mss bytes of 'true size' for each segment.
-			 * The last segment will contain the remaining.
-			 */
-			skb->truesize = mss;
-			gso_skb->truesize -= mss;
+			sum_truesize += skb->truesize;
 		}
 		skb = skb->next;
 		th = tcp_hdr(skb);
@@ -125,7 +120,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
 	if (copy_destructor) {
 		swap(gso_skb->sk, skb->sk);
 		swap(gso_skb->destructor, skb->destructor);
-		swap(gso_skb->truesize, skb->truesize);
+		sum_truesize += skb->truesize;
+		atomic_add(sum_truesize - gso_skb->truesize,
+			   &skb->sk->sk_wmem_alloc);
 	}
 
 	delta = htonl(oldlen + (skb_tail_pointer(skb) -
-- 
cgit v1.2.3


From daba287b299ec7a2c61ae3a714920e90e8396ad5 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sun, 27 Oct 2013 17:29:11 +0100
Subject: ipv4: fix DO and PROBE pmtu mode regarding local fragmentation with
 UFO/CORK

UFO as well as UDP_CORK do not respect IP_PMTUDISC_DO and
IP_PMTUDISC_PROBE well enough.

UFO enabled packet delivery just appends all frags to the cork and hands
it over to the network card. So we just deliver non-DF udp fragments
(DF-flag may get overwritten by hardware or virtual UFO enabled
interface).

UDP_CORK does enqueue the data until the cork is disengaged. At this
point it sets the correct IP_DF and local_df flags and hands it over to
ip_fragment which in this case will generate an icmp error which gets
appended to the error socket queue. This is not reflected in the syscall
error (of course, if UFO is enabled this also won't happen).

Improve this by checking the pmtudisc flags before appending data to the
socket and if we still can fit all data in one packet when IP_PMTUDISC_DO
or IP_PMTUDISC_PROBE is set, only then proceed.

We use (mtu-fragheaderlen) to check for the maximum length because we
ensure not to generate a fragment and non-fragmented data does not need
to have its length aligned on 64 bit boundaries. Also the passed in
ip_options are already aligned correctly.

Maybe, we can relax some other checks around ip_fragment. This needs
more research.

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8fbac7de1e1b..51be64e18e32 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -810,7 +810,7 @@ static int __ip_append_data(struct sock *sk,
 	int copy;
 	int err;
 	int offset = 0;
-	unsigned int maxfraglen, fragheaderlen;
+	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 
@@ -823,8 +823,10 @@ static int __ip_append_data(struct sock *sk,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
+			 mtu : 0xFFFF;
 
-	if (cork->length + length > 0xFFFF - fragheaderlen) {
+	if (cork->length + length > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
 			       mtu-exthdrlen);
 		return -EMSGSIZE;
@@ -1122,7 +1124,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 	int mtu;
 	int len;
 	int err;
-	unsigned int maxfraglen, fragheaderlen, fraggap;
+	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
 
 	if (inet->hdrincl)
 		return -EPERM;
@@ -1146,8 +1148,10 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
+			 mtu : 0xFFFF;
 
-	if (cork->length + size > 0xFFFF - fragheaderlen) {
+	if (cork->length + size > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
 		return -EMSGSIZE;
 	}
-- 
cgit v1.2.3


From 123b0d1ba0a98ef12550d82b79ccb8d89090f871 Mon Sep 17 00:00:00 2001
From: Mathias Krause <mathias.krause@secunet.com>
Date: Fri, 18 Oct 2013 12:09:04 +0200
Subject: net: esp{4,6}: remove padlen from struct esp_data

The padlen member of struct esp_data is always zero. Get rid of it.

Signed-off-by: Mathias Krause <mathias.krause@secunet.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/esp.h | 3 ---
 net/ipv4/esp4.c   | 9 +--------
 net/ipv6/esp6.c   | 9 +--------
 3 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/esp.h b/include/net/esp.h
index 1356dda00d22..706b740d7057 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -6,9 +6,6 @@
 struct crypto_aead;
 
 struct esp_data {
-	/* 0..255 */
-	int padlen;
-
 	/* Confidentiality & Integrity */
 	struct crypto_aead *aead;
 };
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 109ee89f123e..8b5386a6cb88 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -154,8 +154,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
-	if (esp->padlen)
-		clen = ALIGN(clen, esp->padlen);
 	plen = clen - skb->len - tfclen;
 
 	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -461,7 +459,6 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
-	u32 align = max_t(u32, blksize, esp->padlen);
 	unsigned int net_adj;
 
 	switch (x->props.mode) {
@@ -477,7 +474,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 	}
 
 	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
-		 net_adj) & ~(align - 1)) + net_adj - 2;
+		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
 static void esp4_err(struct sk_buff *skb, u32 info)
@@ -659,8 +656,6 @@ static int esp_init_state(struct xfrm_state *x)
 
 	aead = esp->aead;
 
-	esp->padlen = 0;
-
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -683,8 +678,6 @@ static int esp_init_state(struct xfrm_state *x)
 	}
 
 	align = ALIGN(crypto_aead_blocksize(aead), 4);
-	if (esp->padlen)
-		align = max_t(u32, align, esp->padlen);
 	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
 
 error:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index d3618a78fcac..0073cd096984 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -181,8 +181,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
-	if (esp->padlen)
-		clen = ALIGN(clen, esp->padlen);
 	plen = clen - skb->len - tfclen;
 
 	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -416,7 +414,6 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
-	u32 align = max_t(u32, blksize, esp->padlen);
 	unsigned int net_adj;
 
 	if (x->props.mode != XFRM_MODE_TUNNEL)
@@ -425,7 +422,7 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 		net_adj = 0;
 
 	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
-		 net_adj) & ~(align - 1)) + net_adj - 2;
+		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
 static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
@@ -606,8 +603,6 @@ static int esp6_init_state(struct xfrm_state *x)
 
 	aead = esp->aead;
 
-	esp->padlen = 0;
-
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
 	switch (x->props.mode) {
@@ -626,8 +621,6 @@ static int esp6_init_state(struct xfrm_state *x)
 	}
 
 	align = ALIGN(crypto_aead_blocksize(aead), 4);
-	if (esp->padlen)
-		align = max_t(u32, align, esp->padlen);
 	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
 
 error:
-- 
cgit v1.2.3


From 1c5ad13f7c2b2afe30e43858d04fff979dc9d243 Mon Sep 17 00:00:00 2001
From: Mathias Krause <mathias.krause@secunet.com>
Date: Fri, 18 Oct 2013 12:09:05 +0200
Subject: net: esp{4,6}: get rid of struct esp_data

struct esp_data consists of a single pointer, vanishing the need for it
to be a structure. Fold the pointer into 'data' direcly, removing one
level of pointer indirection.

Signed-off-by: Mathias Krause <mathias.krause@secunet.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/esp.h |  7 -------
 net/ipv4/esp4.c   | 40 ++++++++++++++--------------------------
 net/ipv6/esp6.c   | 39 ++++++++++++++-------------------------
 3 files changed, 28 insertions(+), 58 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/esp.h b/include/net/esp.h
index 706b740d7057..c92213c38312 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -3,13 +3,6 @@
 
 #include <linux/skbuff.h>
 
-struct crypto_aead;
-
-struct esp_data {
-	/* Confidentiality & Integrity */
-	struct crypto_aead *aead;
-};
-
 void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
 
 struct ip_esp_hdr;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 8b5386a6cb88..7785b28061ac 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct aead_givcrypt_request *req;
 	struct scatterlist *sg;
 	struct scatterlist *asg;
-	struct esp_data *esp;
 	struct sk_buff *trailer;
 	void *tmp;
 	u8 *iv;
@@ -139,8 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* skb is pure payload to encrypt */
 
-	esp = x->data;
-	aead = esp->aead;
+	aead = x->data;
 	alen = crypto_aead_authsize(aead);
 
 	tfclen = 0;
@@ -278,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)
 {
 	const struct iphdr *iph;
 	struct xfrm_state *x = xfrm_input_state(skb);
-	struct esp_data *esp = x->data;
-	struct crypto_aead *aead = esp->aead;
+	struct crypto_aead *aead = x->data;
 	int alen = crypto_aead_authsize(aead);
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 	int elen = skb->len - hlen;
@@ -374,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)
 static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct ip_esp_hdr *esph;
-	struct esp_data *esp = x->data;
-	struct crypto_aead *aead = esp->aead;
+	struct crypto_aead *aead = x->data;
 	struct aead_request *req;
 	struct sk_buff *trailer;
 	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -457,8 +453,8 @@ out:
 
 static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 {
-	struct esp_data *esp = x->data;
-	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+	struct crypto_aead *aead = x->data;
+	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	unsigned int net_adj;
 
 	switch (x->props.mode) {
@@ -473,7 +469,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 		BUG();
 	}
 
-	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
 		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
@@ -508,18 +504,16 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 
 static void esp_destroy(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = x->data;
 
-	if (!esp)
+	if (!aead)
 		return;
 
-	crypto_free_aead(esp->aead);
-	kfree(esp);
+	crypto_free_aead(aead);
 }
 
 static int esp_init_aead(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
 	struct crypto_aead *aead;
 	int err;
 
@@ -528,7 +522,7 @@ static int esp_init_aead(struct xfrm_state *x)
 	if (IS_ERR(aead))
 		goto error;
 
-	esp->aead = aead;
+	x->data = aead;
 
 	err = crypto_aead_setkey(aead, x->aead->alg_key,
 				 (x->aead->alg_key_len + 7) / 8);
@@ -545,7 +539,6 @@ error:
 
 static int esp_init_authenc(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
 	struct crypto_aead *aead;
 	struct crypto_authenc_key_param *param;
 	struct rtattr *rta;
@@ -580,7 +573,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 	if (IS_ERR(aead))
 		goto error;
 
-	esp->aead = aead;
+	x->data = aead;
 
 	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
 		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -635,16 +628,11 @@ error:
 
 static int esp_init_state(struct xfrm_state *x)
 {
-	struct esp_data *esp;
 	struct crypto_aead *aead;
 	u32 align;
 	int err;
 
-	esp = kzalloc(sizeof(*esp), GFP_KERNEL);
-	if (esp == NULL)
-		return -ENOMEM;
-
-	x->data = esp;
+	x->data = NULL;
 
 	if (x->aead)
 		err = esp_init_aead(x);
@@ -654,7 +642,7 @@ static int esp_init_state(struct xfrm_state *x)
 	if (err)
 		goto error;
 
-	aead = esp->aead;
+	aead = x->data;
 
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
@@ -678,7 +666,7 @@ static int esp_init_state(struct xfrm_state *x)
 	}
 
 	align = ALIGN(crypto_aead_blocksize(aead), 4);
-	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
 
 error:
 	return err;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 0073cd096984..87eb79e65e49 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -164,10 +164,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	u8 *iv;
 	u8 *tail;
 	__be32 *seqhi;
-	struct esp_data *esp = x->data;
 
 	/* skb is pure payload to encrypt */
-	aead = esp->aead;
+	aead = x->data;
 	alen = crypto_aead_authsize(aead);
 
 	tfclen = 0;
@@ -269,8 +268,7 @@ error:
 static int esp_input_done2(struct sk_buff *skb, int err)
 {
 	struct xfrm_state *x = xfrm_input_state(skb);
-	struct esp_data *esp = x->data;
-	struct crypto_aead *aead = esp->aead;
+	struct crypto_aead *aead = x->data;
 	int alen = crypto_aead_authsize(aead);
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 	int elen = skb->len - hlen;
@@ -323,8 +321,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)
 static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct ip_esp_hdr *esph;
-	struct esp_data *esp = x->data;
-	struct crypto_aead *aead = esp->aead;
+	struct crypto_aead *aead = x->data;
 	struct aead_request *req;
 	struct sk_buff *trailer;
 	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -412,8 +409,8 @@ out:
 
 static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 {
-	struct esp_data *esp = x->data;
-	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+	struct crypto_aead *aead = x->data;
+	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	unsigned int net_adj;
 
 	if (x->props.mode != XFRM_MODE_TUNNEL)
@@ -421,7 +418,7 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
 	else
 		net_adj = 0;
 
-	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
 		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
@@ -452,18 +449,16 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 static void esp6_destroy(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = x->data;
 
-	if (!esp)
+	if (!aead)
 		return;
 
-	crypto_free_aead(esp->aead);
-	kfree(esp);
+	crypto_free_aead(aead);
 }
 
 static int esp_init_aead(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
 	struct crypto_aead *aead;
 	int err;
 
@@ -472,7 +467,7 @@ static int esp_init_aead(struct xfrm_state *x)
 	if (IS_ERR(aead))
 		goto error;
 
-	esp->aead = aead;
+	x->data = aead;
 
 	err = crypto_aead_setkey(aead, x->aead->alg_key,
 				 (x->aead->alg_key_len + 7) / 8);
@@ -489,7 +484,6 @@ error:
 
 static int esp_init_authenc(struct xfrm_state *x)
 {
-	struct esp_data *esp = x->data;
 	struct crypto_aead *aead;
 	struct crypto_authenc_key_param *param;
 	struct rtattr *rta;
@@ -524,7 +518,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 	if (IS_ERR(aead))
 		goto error;
 
-	esp->aead = aead;
+	x->data = aead;
 
 	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
 		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -579,7 +573,6 @@ error:
 
 static int esp6_init_state(struct xfrm_state *x)
 {
-	struct esp_data *esp;
 	struct crypto_aead *aead;
 	u32 align;
 	int err;
@@ -587,11 +580,7 @@ static int esp6_init_state(struct xfrm_state *x)
 	if (x->encap)
 		return -EINVAL;
 
-	esp = kzalloc(sizeof(*esp), GFP_KERNEL);
-	if (esp == NULL)
-		return -ENOMEM;
-
-	x->data = esp;
+	x->data = NULL;
 
 	if (x->aead)
 		err = esp_init_aead(x);
@@ -601,7 +590,7 @@ static int esp6_init_state(struct xfrm_state *x)
 	if (err)
 		goto error;
 
-	aead = esp->aead;
+	aead = x->data;
 
 	x->props.header_len = sizeof(struct ip_esp_hdr) +
 			      crypto_aead_ivsize(aead);
@@ -621,7 +610,7 @@ static int esp6_init_state(struct xfrm_state *x)
 	}
 
 	align = ALIGN(crypto_aead_blocksize(aead), 4);
-	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
 
 error:
 	return err;
-- 
cgit v1.2.3


From c968601d174739cb1e7100c95e0eb3d2f7e91bc9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 29 Oct 2013 10:09:05 -0700
Subject: tcp: temporarily disable Fast Open on SYN timeout

Fast Open currently has a fall back feature to address SYN-data being
dropped but it requires the middle-box to pass on regular SYN retry
after SYN-data. This is implemented in commit aab487435 ("net-tcp:
Fast Open client - detecting SYN-data drops")

However some NAT boxes will drop all subsequent packets after first
SYN-data and blackholes the entire connections.  An example is in
commit 356d7d8 "netfilter: nf_conntrack: fix tcp_in_window for Fast
Open".

The sender should note such incidents and fall back to use the regular
TCP handshake on subsequent attempts temporarily as well: after the
second SYN timeouts the original Fast Open SYN is most likely lost.
When such an event recurs Fast Open is disabled based on the number of
recurrences exponentially.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 5 +++--
 net/ipv4/tcp_timer.c   | 6 +++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4a2a84110dfb..2ab09cbae74d 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -671,8 +671,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
 
 		write_seqlock_bh(&fastopen_seqlock);
-		tfom->mss = mss;
-		if (cookie->len > 0)
+		if (mss)
+			tfom->mss = mss;
+		if (cookie && cookie->len > 0)
 			tfom->cookie = *cookie;
 		if (syn_lost) {
 			++tfom->syn_loss;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index af07b5b23ebf..64f0354c84c7 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,12 +156,16 @@ static bool retransmits_timed_out(struct sock *sk,
 static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	int retry_until;
 	bool do_reset, syn_set = false;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-		if (icsk->icsk_retransmits)
+		if (icsk->icsk_retransmits) {
 			dst_negative_advice(sk);
+			if (tp->syn_fastopen || tp->syn_data)
+				tcp_fastopen_cache_set(sk, 0, NULL, true);
+		}
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 		syn_set = true;
 	} else {
-- 
cgit v1.2.3


From 84502b5ef9849a9694673b15c31bd3ac693010ae Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 30 Oct 2013 11:16:28 +0100
Subject: xfrm: Fix null pointer dereference when decoding sessions

On some codepaths the skb does not have a dst entry
when xfrm_decode_session() is called. So check for
a valid skb_dst() before dereferencing the device
interface index. We use 0 as the device index if
there is no valid skb_dst(), or at reverse decoding
we use skb_iif as device interface index.

Bug was introduced with git commit bafd4bd4dc
("xfrm: Decode sessions with output interface.").

Reported-by: Meelis Roos <mroos@linux.ee>
Tested-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 6 +++++-
 net/ipv6/xfrm6_policy.c | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 4764ee48447c..e1a63930a967 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -104,10 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 	const struct iphdr *iph = ip_hdr(skb);
 	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
 	struct flowi4 *fl4 = &fl->u.ip4;
+	int oif = 0;
+
+	if (skb_dst(skb))
+		oif = skb_dst(skb)->dev->ifindex;
 
 	memset(fl4, 0, sizeof(struct flowi4));
 	fl4->flowi4_mark = skb->mark;
-	fl4->flowi4_oif = skb_dst(skb)->dev->ifindex;
+	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
 
 	if (!ip_is_fragment(iph)) {
 		switch (iph->protocol) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index dd503a3535ff..5f8e128c512d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -135,10 +135,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 	struct ipv6_opt_hdr *exthdr;
 	const unsigned char *nh = skb_network_header(skb);
 	u8 nexthdr = nh[IP6CB(skb)->nhoff];
+	int oif = 0;
+
+	if (skb_dst(skb))
+		oif = skb_dst(skb)->dev->ifindex;
 
 	memset(fl6, 0, sizeof(struct flowi6));
 	fl6->flowi6_mark = skb->mark;
-	fl6->flowi6_oif = skb_dst(skb)->dev->ifindex;
+	fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
 
 	fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
 	fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
-- 
cgit v1.2.3


From ca0e8bd68bae3d0bad758b5a82dbf9327b75325c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 30 Oct 2013 13:31:30 +0800
Subject: netfilter: nf_tables: remove duplicated include from nf_tables_ipv4.c

Remove duplicated include.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_tables_ipv4.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 8f7536be1322..0f4cbfeb19bd 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -16,7 +16,6 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
-#include <net/net_namespace.h>
 #include <net/netfilter/nf_tables_ipv4.h>
 
 static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
-- 
cgit v1.2.3


From 0d41cca490c274352211efac50e9598d39a9dc80 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 31 Oct 2013 09:19:32 -0700
Subject: tcp: enable sockets to use MSG_FASTOPEN by default

Applications have started to use Fast Open (e.g., Chrome browser has
such an optional flag) and the feature has gone through several
generations of kernels since 3.7 with many real network tests. It's
time to enable this flag by default for applications to test more
conveniently and extensively.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 4 ++--
 net/ipv4/tcp_fastopen.c                | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a46d78583ae1..6c0098359ca6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -451,7 +451,7 @@ tcp_fastopen - INTEGER
 	connect() to perform a TCP handshake automatically.
 
 	The values (bitmap) are
-	1: Enables sending data in the opening SYN on the client.
+	1: Enables sending data in the opening SYN on the client w/ MSG_FASTOPEN.
 	2: Enables TCP Fast Open on the server side, i.e., allowing data in
 	   a SYN packet to be accepted and passed to the application before
 	   3-way hand shake finishes.
@@ -464,7 +464,7 @@ tcp_fastopen - INTEGER
 	   different ways of setting max_qlen without the TCP_FASTOPEN socket
 	   option.
 
-	Default: 0
+	Default: 1
 
 	Note that the client & server side Fast Open flags (1 and 2
 	respectively) must be also enabled before the rest of flags can take
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 766032b4a6c3..f195d9316e55 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -8,7 +8,7 @@
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_fastopen __read_mostly;
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
 
 struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 
-- 
cgit v1.2.3


From 9f9843a751d0a2057f9f3d313886e7e5e6ebaac9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 31 Oct 2013 11:07:31 -0700
Subject: tcp: properly handle stretch acks in slow start

Slow start now increases cwnd by 1 if an ACK acknowledges some packets,
regardless the number of packets. Consequently slow start performance
is highly dependent on the degree of the stretch ACKs caused by
receiver or network ACK compression mechanisms (e.g., delayed-ACK,
GRO, etc).  But slow start algorithm is to send twice the amount of
packets of packets left so it should process a stretch ACK of degree
N as if N ACKs of degree 1, then exits when cwnd exceeds ssthresh. A
follow up patch will use the remainder of the N (if greater than 1)
to adjust cwnd in the congestion avoidance phase.

In addition this patch retires the experimental limited slow start
(LSS) feature. LSS has multiple drawbacks but questionable benefit. The
fractional cwnd increase in LSS requires a loop in slow start even
though it's rarely used. Configuring such an increase step via a global
sysctl on different BDPS seems hard. Finally and most importantly the
slow start overshoot concern is now better covered by the Hybrid slow
start (hystart) enabled by default.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 11 --------
 include/net/tcp.h                      |  7 +++--
 net/ipv4/sysctl_net_ipv4.c             |  7 -----
 net/ipv4/tcp_bic.c                     |  5 ++--
 net/ipv4/tcp_cong.c                    | 47 ++++++++++++----------------------
 net/ipv4/tcp_cubic.c                   |  5 ++--
 net/ipv4/tcp_highspeed.c               |  4 +--
 net/ipv4/tcp_htcp.c                    |  4 +--
 net/ipv4/tcp_hybla.c                   |  5 ++--
 net/ipv4/tcp_illinois.c                |  5 ++--
 net/ipv4/tcp_input.c                   |  6 ++---
 net/ipv4/tcp_lp.c                      |  5 ++--
 net/ipv4/tcp_scalable.c                |  5 ++--
 net/ipv4/tcp_vegas.c                   | 11 ++++----
 net/ipv4/tcp_veno.c                    |  9 ++++---
 net/ipv4/tcp_yeah.c                    |  5 ++--
 16 files changed, 59 insertions(+), 82 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6c0098359ca6..8b8a05787641 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -267,17 +267,6 @@ tcp_max_orphans - INTEGER
 	more aggressively. Let me to remind again: each orphan eats
 	up to ~64K of unswappable memory.
 
-tcp_max_ssthresh - INTEGER
-	Limited Slow-Start for TCP with large congestion windows (cwnd) defined in
-	RFC3742. Limited slow-start is a mechanism to limit growth of the cwnd
-	on the region where cwnd is larger than tcp_max_ssthresh. TCP increases cwnd
-	by at most tcp_max_ssthresh segments, and by at least tcp_max_ssthresh/2
-	segments per RTT when the cwnd is above tcp_max_ssthresh.
-	If TCP connection increased cwnd to thousands (or tens of thousands) segments,
-	and thousands of packets were being dropped during slow-start, you can set
-	tcp_max_ssthresh to improve performance for new TCP connection.
-	Default: 0 (off)
-
 tcp_max_syn_backlog - INTEGER
 	Maximal number of remembered connection requests, which have not
 	received an acknowledgment from connecting client.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d7b4bdc972f..70e55d200610 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -275,7 +275,6 @@ extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
-extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
@@ -797,7 +796,7 @@ struct tcp_congestion_ops {
 	/* lower bound for congestion window (optional) */
 	u32 (*min_cwnd)(const struct sock *sk);
 	/* do new cwnd calculation (required) */
-	void (*cong_avoid)(struct sock *sk, u32 ack, u32 in_flight);
+	void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
 	/* call before changing ca_state (optional) */
 	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
@@ -824,12 +823,12 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name);
-void tcp_slow_start(struct tcp_sock *tp);
+int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
 extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight);
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
 u32 tcp_reno_min_cwnd(const struct sock *sk);
 extern struct tcp_congestion_ops tcp_reno;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d5b1390eebbe..3d69ec8dac57 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -700,13 +700,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler   = proc_allowed_congestion_control,
 	},
-	{
-		.procname	= "tcp_max_ssthresh",
-		.data		= &sysctl_tcp_max_ssthresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname       = "tcp_thin_linear_timeouts",
 		.data           = &sysctl_tcp_thin_linear_timeouts,
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c242440..821846fb0a7e 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,7 +140,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 		ca->cnt = 1;
 }
 
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+			      u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -149,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	else {
 		bictcp_update(ca, tp->snd_cwnd);
 		tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 019c2389a341..ad37bf18ae4b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -15,8 +15,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_max_ssthresh = 0;
-
 static DEFINE_SPINLOCK(tcp_cong_list_lock);
 static LIST_HEAD(tcp_cong_list);
 
@@ -299,35 +297,24 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
 }
 EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
 
-/*
- * Slow start is used when congestion window is less than slow start
- * threshold. This version implements the basic RFC2581 version
- * and optionally supports:
- * 	RFC3742 Limited Slow Start  	  - growth limited to max_ssthresh
- *	RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
  */
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
 {
-	int cnt; /* increase in packets */
-	unsigned int delta = 0;
-	u32 snd_cwnd = tp->snd_cwnd;
-
-	if (unlikely(!snd_cwnd)) {
-		pr_err_once("snd_cwnd is nul, please report this bug.\n");
-		snd_cwnd = 1U;
-	}
+	u32 cwnd = tp->snd_cwnd + acked;
 
-	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
-		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */
-	else
-		cnt = snd_cwnd;				/* exponential increase */
-
-	tp->snd_cwnd_cnt += cnt;
-	while (tp->snd_cwnd_cnt >= snd_cwnd) {
-		tp->snd_cwnd_cnt -= snd_cwnd;
-		delta++;
-	}
-	tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
+	if (cwnd > tp->snd_ssthresh)
+		cwnd = tp->snd_ssthresh + 1;
+	acked -= cwnd - tp->snd_cwnd;
+	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+	return acked;
 }
 EXPORT_SYMBOL_GPL(tcp_slow_start);
 
@@ -351,7 +338,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -360,7 +347,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 
 	/* In "safe" area, increase. */
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	/* In dangerous area, increase slowly. */
 	else
 		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b6ae92a51f58..828e4c3ffbaf 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,7 +304,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 		ca->cnt = 1;
 }
 
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+			      u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -315,7 +316,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		if (hystart && after(ack, ca->end_seq))
 			bictcp_hystart_reset(sk);
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	} else {
 		bictcp_update(ca, tp->snd_cwnd);
 		tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 30f27f6b3655..8ed9305dfdf4 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk)
 	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hstcp *ca = inet_csk_ca(sk);
@@ -118,7 +118,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
 		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	else {
 		/* Update AIMD parameters.
 		 *
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index c1a8175361e8..4a194acfd923 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,7 +227,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
 	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
 }
 
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	else {
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4d..478fe82611bf 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -85,7 +85,8 @@ static inline u32 hybla_fraction(u32 odds)
  *     o Give cwnd a new value based on the model proposed
  *     o remember increments <1
  */
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+			     u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hybla *ca = inet_csk_ca(sk);
@@ -102,7 +103,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		return;
 
 	if (!ca->hybla_en) {
-		tcp_reno_cong_avoid(sk, ack, in_flight);
+		tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 		return;
 	}
 
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c871..8a520996f3d2 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -256,7 +256,8 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
 /*
  * Increase window in response to successful acknowledgment.
  */
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+				    u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct illinois *ca = inet_csk_ca(sk);
@@ -270,7 +271,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 
 	/* In slow start */
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 
 	else {
 		u32 delta;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 63095b218b4a..c53b7f35c51d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2934,10 +2934,10 @@ static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
 		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
 	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -3454,7 +3454,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* Advance cwnd if state allows */
 	if (tcp_may_raise_cwnd(sk, flag))
-		tcp_cong_avoid(sk, ack, prior_in_flight);
+		tcp_cong_avoid(sk, ack, acked, prior_in_flight);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 72f7218b03f5..991d62a2f9bb 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,12 +115,13 @@ static void tcp_lp_init(struct sock *sk)
  * Will only call newReno CA when away from inference.
  * From TCP-LP's paper, this will be handled in additive increasement.
  */
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+			      u32 in_flight)
 {
 	struct lp *lp = inet_csk_ca(sk);
 
 	if (!(lp->flag & LP_WITHIN_INF))
-		tcp_reno_cong_avoid(sk, ack, in_flight);
+		tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 }
 
 /**
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8ce55b8aaec8..19ea6c2951f3 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,7 +15,8 @@
 #define TCP_SCALABLE_AI_CNT	50U
 #define TCP_SCALABLE_MD_SCALE	3
 
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+				    u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -23,7 +24,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 	else
 		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
 }
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 80fa2bfd7ede..06cae62bf208 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,13 +163,14 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
 	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);
 }
 
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+				 u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct vegas *vegas = inet_csk_ca(sk);
 
 	if (!vegas->doing_vegas_now) {
-		tcp_reno_cong_avoid(sk, ack, in_flight);
+		tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 		return;
 	}
 
@@ -194,7 +195,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 			/* We don't have enough RTT samples to do the Vegas
 			 * calculation, so we'll behave like Reno.
 			 */
-			tcp_reno_cong_avoid(sk, ack, in_flight);
+			tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 		} else {
 			u32 rtt, diff;
 			u64 target_cwnd;
@@ -243,7 +244,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 
 			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Slow start.  */
-				tcp_slow_start(tp);
+				tcp_slow_start(tp, acked);
 			} else {
 				/* Congestion avoidance. */
 
@@ -283,7 +284,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 	}
 	/* Use normal slow start */
 	else if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 
 }
 
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index ac43cd747bce..326475a94865 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,13 +114,14 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 		tcp_veno_init(sk);
 }
 
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+				u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct veno *veno = inet_csk_ca(sk);
 
 	if (!veno->doing_veno_now) {
-		tcp_reno_cong_avoid(sk, ack, in_flight);
+		tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 		return;
 	}
 
@@ -133,7 +134,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		/* We don't have enough rtt samples to do the Veno
 		 * calculation, so we'll behave like Reno.
 		 */
-		tcp_reno_cong_avoid(sk, ack, in_flight);
+		tcp_reno_cong_avoid(sk, ack, acked, in_flight);
 	} else {
 		u64 target_cwnd;
 		u32 rtt;
@@ -152,7 +153,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 
 		if (tp->snd_cwnd <= tp->snd_ssthresh) {
 			/* Slow start.  */
-			tcp_slow_start(tp);
+			tcp_slow_start(tp, acked);
 		} else {
 			/* Congestion avoidance. */
 			if (veno->diff < beta) {
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 05c3b6f0e8e1..a347a078ee07 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,7 +69,8 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
 	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
 }
 
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
+				u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct yeah *yeah = inet_csk_ca(sk);
@@ -78,7 +79,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp);
+		tcp_slow_start(tp, acked);
 
 	else if (!yeah->doing_reno_now) {
 		/* Scalable */
-- 
cgit v1.2.3


From 482fc6094afad572a4ea1fd722e7b11ca72022a0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 5 Nov 2013 02:24:17 +0100
Subject: ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE

Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery,
their sockets won't accept and install new path mtu information and they
will always use the interface mtu for outgoing packets. It is guaranteed
that the packet is not fragmented locally. But we won't set the DF-Flag
on the outgoing frames.

Florian Weimer had the idea to use this flag to ensure DNS servers are
never generating outgoing fragments. They may well be fragmented on the
path, but the server never stores or usees path mtu values, which could
well be forged in an attack.

(The root of the problem with path MTU discovery is that there is
no reliable way to authenticate ICMP Fragmentation Needed But DF Set
messages because they are sent from intermediate routers with their
source addresses, and the IMCP payload will not always contain sufficient
information to identify a flow.)

Recent research in the DNS community showed that it is possible to
implement an attack where DNS cache poisoning is feasible by spoofing
fragments. This work was done by Amir Herzberg and Haya Shulman:
<https://sites.google.com/site/hayashulman/files/fragmentation-poisoning.pdf>

This issue was previously discussed among the DNS community, e.g.
<http://www.ietf.org/mail-archive/web/dnsext/current/msg01204.html>,
without leading to fixes.

This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode
regarding local fragmentation with UFO/CORK" for the enforcement of the
non-fragmentable checks. If other users than ip_append_page/data should
use this semantic too, we have to add a new flag to IPCB(skb)->flags to
suppress local fragmentation and check for this in ip_finish_output.

Many thanks to Florian Weimer for the idea and feedback while implementing
this patch.

Cc: David S. Miller <davem@davemloft.net>
Suggested-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     | 16 ++++++++++++----
 include/uapi/linux/in.h |  5 +++++
 net/dccp/ipv4.c         |  1 +
 net/ipv4/ip_output.c    |  8 ++++----
 net/ipv4/ip_sockglue.c  |  2 +-
 net/ipv4/route.c        |  4 ++++
 net/ipv4/tcp_ipv4.c     |  1 +
 7 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index dd4ae0029fd8..f68c167280a7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -313,12 +313,20 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	return hoplimit;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+static inline bool ip_sk_accept_pmtu(const struct sock *sk)
 {
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+	return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE;
+}
 
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+static inline bool ip_sk_use_pmtu(const struct sock *sk)
+{
+	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+}
+
+static inline int ip_skb_dst_mtu(const struct sk_buff *skb)
+{
+	return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ?
+	       dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu;
 }
 
 #endif	/* _ROUTE_H */
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index f9e8e496ae5d..393c5de09d42 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -115,6 +115,11 @@ struct in_addr {
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
 #define IP_PMTUDISC_PROBE		3       /* Ignore dst pmtu      */
+/* Always use interface mtu (ignores dst pmtu) but don't set DF flag.
+ * Also incoming ICMP frag_needed notifications will be ignored on
+ * this socket to prevent accepting spoofed ones.
+ */
+#define IP_PMTUDISC_INTERFACE		4
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 720c36225ed9..d9f65fc66db5 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -174,6 +174,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    ip_sk_accept_pmtu(sk) &&
 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		dccp_sync_mss(sk, mtu);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 51be64e18e32..912402752f2f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1037,7 +1037,6 @@ error:
 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 			 struct ipcm_cookie *ipc, struct rtable **rtp)
 {
-	struct inet_sock *inet = inet_sk(sk);
 	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
@@ -1063,8 +1062,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	 * We steal reference to this route, caller should not release it
 	 */
 	*rtp = NULL;
-	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
+	cork->fragsize = ip_sk_use_pmtu(sk) ?
+			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
@@ -1315,7 +1314,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
 	    (skb->len <= dst_mtu(&rt->dst) &&
 	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0626f2cb192e..3f858266fa7e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -627,7 +627,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->nodefrag = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d2d325382b13..f428935c50db 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1036,6 +1036,10 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	bool new = false;
 
 	bh_lock_sock(sk);
+
+	if (!ip_sk_accept_pmtu(sk))
+		goto out;
+
 	rt = (struct rtable *) __sk_dst_get(sk);
 
 	if (sock_owned_by_user(sk) || !rt) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 300ab2c93f29..14bba8a1c5a7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -288,6 +288,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    ip_sk_accept_pmtu(sk) &&
 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		tcp_sync_mss(sk, mtu);
 
-- 
cgit v1.2.3


From 827da44c61419f29ae3be198c342e2147f1a10cb Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 7 Oct 2013 15:51:58 -0700
Subject: net: Explicitly initialize u64_stats_sync structures for lockdep

In order to enable lockdep on seqcount/seqlock structures, we
must explicitly initialize any locks.

The u64_stats_sync structure, uses a seqcount, and thus we need
to introduce a u64_stats_init() function and use it to initialize
the structure.

This unfortunately adds a lot of fairly trivial initialization code
to a number of drivers. But the benefit of ensuring correctness makes
this worth while.

Because these changes are required for lockdep to be enabled, and the
changes are quite trivial, I've not yet split this patch out into 30-some
separate patches, as I figured it would be better to get the various
maintainers thoughts on how to best merge this change along with
the seqcount lockdep enablement.

Feedback would be appreciated!

Signed-off-by: John Stultz <john.stultz@linaro.org>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: James Morris <jmorris@namei.org>
Cc: Jesse Gross <jesse@nicira.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mirko Lindner <mlindner@marvell.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Roger Luethi <rl@hellgate.ch>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Simon Horman <horms@verge.net.au>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Wensong Zhang <wensong@linux-vs.org>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/net/dummy.c                            |  6 ++++++
 drivers/net/ethernet/emulex/benet/be_main.c    |  4 ++++
 drivers/net/ethernet/intel/igb/igb_main.c      |  5 +++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |  4 ++++
 drivers/net/ethernet/marvell/mvneta.c          |  3 +++
 drivers/net/ethernet/marvell/sky2.c            |  3 +++
 drivers/net/ethernet/neterion/vxge/vxge-main.c |  4 ++++
 drivers/net/ethernet/nvidia/forcedeth.c        |  2 ++
 drivers/net/ethernet/realtek/8139too.c         |  3 +++
 drivers/net/ethernet/tile/tilepro.c            |  2 ++
 drivers/net/ethernet/via/via-rhine.c           |  3 +++
 drivers/net/ifb.c                              |  5 +++++
 drivers/net/loopback.c                         |  6 ++++++
 drivers/net/macvlan.c                          |  7 +++++++
 drivers/net/nlmon.c                            |  8 ++++++++
 drivers/net/team/team.c                        |  6 ++++++
 drivers/net/team/team_mode_loadbalance.c       |  9 ++++++++-
 drivers/net/veth.c                             |  8 ++++++++
 drivers/net/virtio_net.c                       |  8 ++++++++
 drivers/net/vxlan.c                            |  8 ++++++++
 drivers/net/xen-netfront.c                     |  6 ++++++
 include/linux/u64_stats_sync.h                 |  7 +++++++
 net/8021q/vlan_dev.c                           |  9 ++++++++-
 net/bridge/br_device.c                         |  7 +++++++
 net/ipv4/af_inet.c                             | 14 ++++++++++++++
 net/ipv4/ip_tunnel.c                           |  8 +++++++-
 net/ipv6/addrconf.c                            | 14 ++++++++++++++
 net/ipv6/af_inet6.c                            | 14 ++++++++++++++
 net/ipv6/ip6_gre.c                             | 15 +++++++++++++++
 net/ipv6/ip6_tunnel.c                          |  7 +++++++
 net/ipv6/sit.c                                 | 15 +++++++++++++++
 net/netfilter/ipvs/ip_vs_ctl.c                 | 25 ++++++++++++++++++++++---
 net/openvswitch/datapath.c                     |  6 ++++++
 net/openvswitch/vport.c                        |  8 ++++++++
 34 files changed, 253 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index b710c6b2d659..bd8f84b0b894 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -88,10 +88,16 @@ static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static int dummy_dev_init(struct net_device *dev)
 {
+	int i;
 	dev->dstats = alloc_percpu(struct pcpu_dstats);
 	if (!dev->dstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_dstats *dstats;
+		dstats = per_cpu_ptr(dev->dstats, i);
+		u64_stats_init(&dstats->syncp);
+	}
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 2c38cc402119..edd75950855a 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2047,6 +2047,9 @@ static int be_tx_qs_create(struct be_adapter *adapter)
 		if (status)
 			return status;
 
+		u64_stats_init(&txo->stats.sync);
+		u64_stats_init(&txo->stats.sync_compl);
+
 		/* If num_evt_qs is less than num_tx_qs, then more than
 		 * one txq share an eq
 		 */
@@ -2108,6 +2111,7 @@ static int be_rx_cqs_create(struct be_adapter *adapter)
 		if (rc)
 			return rc;
 
+		u64_stats_init(&rxo->stats.sync);
 		eq = &adapter->eq_obj[i % adapter->num_evt_qs].q;
 		rc = be_cmd_cq_create(adapter, cq, eq, false, 3);
 		if (rc)
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 8cf44f2a8ccd..b6edb93a8fc1 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1223,6 +1223,9 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter,
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = txr_idx;
 
+		u64_stats_init(&ring->tx_syncp);
+		u64_stats_init(&ring->tx_syncp2);
+
 		/* assign ring to adapter */
 		adapter->tx_ring[txr_idx] = ring;
 
@@ -1256,6 +1259,8 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter,
 		ring->count = adapter->rx_ring_count;
 		ring->queue_index = rxr_idx;
 
+		u64_stats_init(&ring->rx_syncp);
+
 		/* assign ring to adapter */
 		adapter->rx_ring[rxr_idx] = ring;
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0ade0cd5ef53..c1750364ddc0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4867,6 +4867,8 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
 	if (!tx_ring->tx_buffer_info)
 		goto err;
 
+	u64_stats_init(&tx_ring->syncp);
+
 	/* round up to nearest 4K */
 	tx_ring->size = tx_ring->count * sizeof(union ixgbe_adv_tx_desc);
 	tx_ring->size = ALIGN(tx_ring->size, 4096);
@@ -4949,6 +4951,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
 	if (!rx_ring->rx_buffer_info)
 		goto err;
 
+	u64_stats_init(&rx_ring->syncp);
+
 	/* Round up to nearest 4K */
 	rx_ring->size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc);
 	rx_ring->size = ALIGN(rx_ring->size, 4096);
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e35bac7cfdf1..cb4635c0cd73 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2792,6 +2792,9 @@ static int mvneta_probe(struct platform_device *pdev)
 
 	pp = netdev_priv(dev);
 
+	u64_stats_init(&pp->tx_stats.syncp);
+	u64_stats_init(&pp->rx_stats.syncp);
+
 	pp->weight = MVNETA_RX_POLL_WEIGHT;
 	pp->phy_node = phy_node;
 	pp->phy_interface = phy_mode;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index e09a8c6f8536..339d841a538b 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4763,6 +4763,9 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port,
 	sky2->hw = hw;
 	sky2->msg_enable = netif_msg_init(debug, default_msg);
 
+	u64_stats_init(&sky2->tx_stats.syncp);
+	u64_stats_init(&sky2->rx_stats.syncp);
+
 	/* Auto speed and flow control */
 	sky2->flags = SKY2_FLAG_AUTO_SPEED | SKY2_FLAG_AUTO_PAUSE;
 	if (hw->chip_id != CHIP_ID_YUKON_XL)
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 5a20eaf903dd..44626ec1127b 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -2072,6 +2072,10 @@ static int vxge_open_vpaths(struct vxgedev *vdev)
 				vdev->config.tx_steering_type;
 			vpath->fifo.ndev = vdev->ndev;
 			vpath->fifo.pdev = vdev->pdev;
+
+			u64_stats_init(&vpath->fifo.stats.syncp);
+			u64_stats_init(&vpath->ring.stats.syncp);
+
 			if (vdev->config.tx_steering_type)
 				vpath->fifo.txq =
 					netdev_get_tx_queue(vdev->ndev, i);
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 098b96dad66f..2d045be4b5cf 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -5619,6 +5619,8 @@ static int nv_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 	spin_lock_init(&np->lock);
 	spin_lock_init(&np->hwstats_lock);
 	SET_NETDEV_DEV(dev, &pci_dev->dev);
+	u64_stats_init(&np->swstats_rx_syncp);
+	u64_stats_init(&np->swstats_tx_syncp);
 
 	init_timer(&np->oom_kick);
 	np->oom_kick.data = (unsigned long) dev;
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index 3ccedeb8aba0..c40e984868ad 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -791,6 +791,9 @@ static struct net_device *rtl8139_init_board(struct pci_dev *pdev)
 
 	pci_set_master (pdev);
 
+	u64_stats_init(&tp->rx_stats.syncp);
+	u64_stats_init(&tp->tx_stats.syncp);
+
 retry:
 	/* PIO bar register comes first. */
 	bar = !use_io;
diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c
index 106be47716e7..edb2e12a0fe2 100644
--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -1008,6 +1008,8 @@ static void tile_net_register(void *dev_ptr)
 	info->egress_timer.data = (long)info;
 	info->egress_timer.function = tile_net_handle_egress_timer;
 
+	u64_stats_init(&info->stats.syncp);
+
 	priv->cpu[my_cpu] = info;
 
 	/*
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index bdf697b184ae..dd99715e6645 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -987,6 +987,9 @@ static int rhine_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	rp->base = ioaddr;
 
+	u64_stats_init(&rp->tx_stats.syncp);
+	u64_stats_init(&rp->rx_stats.syncp);
+
 	/* Get chip registers into a sane state */
 	rhine_power_init(dev);
 	rhine_hw_init(dev, pioaddr);
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index a3bed28197d2..c14d39bf32d0 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -265,6 +265,7 @@ MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
+	struct ifb_private *dp;
 	int err;
 
 	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
@@ -273,6 +274,10 @@ static int __init ifb_init_one(int index)
 	if (!dev_ifb)
 		return -ENOMEM;
 
+	dp = netdev_priv(dev_ifb);
+	u64_stats_init(&dp->rsync);
+	u64_stats_init(&dp->tsync);
+
 	dev_ifb->rtnl_link_ops = &ifb_link_ops;
 	err = register_netdevice(dev_ifb);
 	if (err < 0)
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index a17d85a331f1..ac24c27b4b2d 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -137,10 +137,16 @@ static const struct ethtool_ops loopback_ethtool_ops = {
 
 static int loopback_dev_init(struct net_device *dev)
 {
+	int i;
 	dev->lstats = alloc_percpu(struct pcpu_lstats);
 	if (!dev->lstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_lstats *lb_stats;
+		lb_stats = per_cpu_ptr(dev->lstats, i);
+		u64_stats_init(&lb_stats->syncp);
+	}
 	return 0;
 }
 
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 9bf46bd19b87..0924e51b9ee0 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -501,6 +501,7 @@ static int macvlan_init(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	const struct net_device *lowerdev = vlan->lowerdev;
+	int i;
 
 	dev->state		= (dev->state & ~MACVLAN_STATE_MASK) |
 				  (lowerdev->state & MACVLAN_STATE_MASK);
@@ -516,6 +517,12 @@ static int macvlan_init(struct net_device *dev)
 	if (!vlan->pcpu_stats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct macvlan_pcpu_stats *mvlstats;
+		mvlstats = per_cpu_ptr(vlan->pcpu_stats, i);
+		u64_stats_init(&mvlstats->syncp);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index b57ce5f48962..d2bb12bfabd5 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -47,8 +47,16 @@ static int nlmon_change_mtu(struct net_device *dev, int new_mtu)
 
 static int nlmon_dev_init(struct net_device *dev)
 {
+	int i;
+
 	dev->lstats = alloc_percpu(struct pcpu_lstats);
 
+	for_each_possible_cpu(i) {
+		struct pcpu_lstats *nlmstats;
+		nlmstats = per_cpu_ptr(dev->lstats, i);
+		u64_stats_init(&nlmstats->syncp);
+	}
+
 	return dev->lstats == NULL ? -ENOMEM : 0;
 }
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 50e43e64d51d..6574eb8766f9 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1540,6 +1540,12 @@ static int team_init(struct net_device *dev)
 	if (!team->pcpu_stats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct team_pcpu_stats *team_stats;
+		team_stats = per_cpu_ptr(team->pcpu_stats, i);
+		u64_stats_init(&team_stats->syncp);
+	}
+
 	for (i = 0; i < TEAM_PORT_HASHENTRIES; i++)
 		INIT_HLIST_HEAD(&team->en_port_hlist[i]);
 	INIT_LIST_HEAD(&team->port_list);
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 829a9cd2b4da..d671fc3ac5ac 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -570,7 +570,7 @@ static int lb_init(struct team *team)
 {
 	struct lb_priv *lb_priv = get_lb_priv(team);
 	lb_select_tx_port_func_t *func;
-	int err;
+	int i, err;
 
 	/* set default tx port selector */
 	func = lb_select_tx_port_get_func("hash");
@@ -588,6 +588,13 @@ static int lb_init(struct team *team)
 		goto err_alloc_pcpu_stats;
 	}
 
+	for_each_possible_cpu(i) {
+		struct lb_pcpu_stats *team_lb_stats;
+		team_lb_stats = per_cpu_ptr(lb_priv->pcpu_stats, i);
+		u64_stats_init(&team_lb_stats->syncp);
+	}
+
+
 	INIT_DELAYED_WORK(&lb_priv->ex->stats.refresh_dw, lb_stats_refresh);
 
 	err = team_options_register(team, lb_options, ARRAY_SIZE(lb_options));
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index eee1f19ef1e9..46e83e3fe999 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -230,10 +230,18 @@ static int veth_change_mtu(struct net_device *dev, int new_mtu)
 
 static int veth_dev_init(struct net_device *dev)
 {
+	int i;
+
 	dev->vstats = alloc_percpu(struct pcpu_vstats);
 	if (!dev->vstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_vstats *veth_stats;
+		veth_stats = per_cpu_ptr(dev->vstats, i);
+		u64_stats_init(&veth_stats->syncp);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9fbdfcd1e1a0..ee384f3d612b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1569,6 +1569,14 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (vi->stats == NULL)
 		goto free;
 
+	for_each_possible_cpu(i) {
+		struct virtnet_stats *virtnet_stats;
+		virtnet_stats = per_cpu_ptr(vi->stats, i);
+		u64_stats_init(&virtnet_stats->tx_syncp);
+		u64_stats_init(&virtnet_stats->rx_syncp);
+	}
+
+
 	vi->vq_index = alloc_percpu(int);
 	if (vi->vq_index == NULL)
 		goto free_stats;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2ef5b6219f3f..01ab64d5f9a4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1884,11 +1884,19 @@ static int vxlan_init(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct vxlan_sock *vs;
+	int i;
 
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *vxlan_stats;
+		vxlan_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&vxlan_stats->syncp);
+	}
+
+
 	spin_lock(&vn->sock_lock);
 	vs = vxlan_find_sock(dev_net(dev), vxlan->dst_port);
 	if (vs) {
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 36808bf25677..54223ac6d8a6 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1338,6 +1338,12 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 	if (np->stats == NULL)
 		goto exit;
 
+	for_each_possible_cpu(i) {
+		struct netfront_stats *xen_nf_stats;
+		xen_nf_stats = per_cpu_ptr(np->stats, i);
+		u64_stats_init(&xen_nf_stats->syncp);
+	}
+
 	/* Initialise tx_skbs as a free chain containing every entry. */
 	np->tx_skb_freelist = 0;
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 8da8c4e87da3..7bfabd20204c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -67,6 +67,13 @@ struct u64_stats_sync {
 #endif
 };
 
+
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+# define u64_stats_init(syncp)	seqcount_init(syncp.seq)
+#else
+# define u64_stats_init(syncp)	do { } while (0)
+#endif
+
 static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 09bf1c38805b..4deff3ed1da1 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -558,7 +558,7 @@ static const struct net_device_ops vlan_netdev_ops;
 static int vlan_dev_init(struct net_device *dev)
 {
 	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
-	int subclass = 0;
+	int subclass = 0, i;
 
 	netif_carrier_off(dev);
 
@@ -612,6 +612,13 @@ static int vlan_dev_init(struct net_device *dev)
 	if (!vlan_dev_priv(dev)->vlan_pcpu_stats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct vlan_pcpu_stats *vlan_stat;
+		vlan_stat = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
+		u64_stats_init(&vlan_stat->syncp);
+	}
+
+
 	return 0;
 }
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ca04163635da..7893d641775b 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -88,11 +88,18 @@ out:
 static int br_dev_init(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	int i;
 
 	br->stats = alloc_percpu(struct br_cpu_netstats);
 	if (!br->stats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct br_cpu_netstats *br_dev_stats;
+		br_dev_stats = per_cpu_ptr(br->stats, i);
+		u64_stats_init(&br_dev_stats->syncp);
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cfeb85cff4f0..5f4617e377b8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1518,6 +1518,7 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 	ptr[0] = __alloc_percpu(mibsize, align);
 	if (!ptr[0])
 		return -ENOMEM;
+
 #if SNMP_ARRAY_SZ == 2
 	ptr[1] = __alloc_percpu(mibsize, align);
 	if (!ptr[1]) {
@@ -1561,6 +1562,8 @@ static const struct net_protocol icmp_protocol = {
 
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
+	int i;
+
 	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
 			  sizeof(struct tcp_mib),
 			  __alignof__(struct tcp_mib)) < 0)
@@ -1569,6 +1572,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 			  sizeof(struct ipstats_mib),
 			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
+
+	for_each_possible_cpu(i) {
+		struct ipstats_mib *af_inet_stats;
+		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i);
+		u64_stats_init(&af_inet_stats->syncp);
+#if SNMP_ARRAY_SZ == 2
+		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i);
+		u64_stats_init(&af_inet_stats->syncp);
+#endif
+	}
+
 	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
 			  sizeof(struct linux_mib),
 			  __alignof__(struct linux_mib)) < 0)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 63a6d6d6b875..caf01176a5e4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -976,13 +976,19 @@ int ip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	int err;
+	int i, err;
 
 	dev->destructor	= ip_tunnel_dev_free;
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ipt_stats;
+		ipt_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ipt_stats->syncp);
+	}
+
 	err = gro_cells_init(&tunnel->gro_cells, dev);
 	if (err) {
 		free_percpu(dev->tstats);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cd3fb301da38..d62d589bb622 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -281,10 +281,24 @@ static void addrconf_mod_dad_timer(struct inet6_ifaddr *ifp,
 
 static int snmp6_alloc_dev(struct inet6_dev *idev)
 {
+	int i;
+
 	if (snmp_mib_init((void __percpu **)idev->stats.ipv6,
 			  sizeof(struct ipstats_mib),
 			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip;
+
+	for_each_possible_cpu(i) {
+		struct ipstats_mib *addrconf_stats;
+		addrconf_stats = per_cpu_ptr(idev->stats.ipv6[0], i);
+		u64_stats_init(&addrconf_stats->syncp);
+#if SNMP_ARRAY_SZ == 2
+		addrconf_stats = per_cpu_ptr(idev->stats.ipv6[1], i);
+		u64_stats_init(&addrconf_stats->syncp);
+#endif
+	}
+
+
 	idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
 					GFP_KERNEL);
 	if (!idev->stats.icmpv6dev)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7c96100b021e..a8f8559b3dce 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -719,6 +719,8 @@ static void ipv6_packet_cleanup(void)
 
 static int __net_init ipv6_init_mibs(struct net *net)
 {
+	int i;
+
 	if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6,
 			  sizeof(struct udp_mib),
 			  __alignof__(struct udp_mib)) < 0)
@@ -731,6 +733,18 @@ static int __net_init ipv6_init_mibs(struct net *net)
 			  sizeof(struct ipstats_mib),
 			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
+
+	for_each_possible_cpu(i) {
+		struct ipstats_mib *af_inet6_stats;
+		af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[0], i);
+		u64_stats_init(&af_inet6_stats->syncp);
+#if SNMP_ARRAY_SZ == 2
+		af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[1], i);
+		u64_stats_init(&af_inet6_stats->syncp);
+#endif
+	}
+
+
 	if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics,
 			  sizeof(struct icmpv6_mib),
 			  __alignof__(struct icmpv6_mib)) < 0)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index bf4a9a084de5..8acb28621f9c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1252,6 +1252,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 static int ip6gre_tunnel_init(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
+	int i;
 
 	tunnel = netdev_priv(dev);
 
@@ -1269,6 +1270,13 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ip6gre_tunnel_stats;
+		ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ip6gre_tunnel_stats->syncp);
+	}
+
+
 	return 0;
 }
 
@@ -1449,6 +1457,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 static int ip6gre_tap_init(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
+	int i;
 
 	tunnel = netdev_priv(dev);
 
@@ -1462,6 +1471,12 @@ static int ip6gre_tap_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ip6gre_tap_stats;
+		ip6gre_tap_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ip6gre_tap_stats->syncp);
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 583b77e2f69b..df1fa58528c6 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1494,12 +1494,19 @@ static inline int
 ip6_tnl_dev_init_gen(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
+	int i;
 
 	t->dev = dev;
 	t->net = dev_net(dev);
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ip6_tnl_stats;
+		ip6_tnl_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ip6_tnl_stats->syncp);
+	}
 	return 0;
 }
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 19269453a8ea..365dc5473eb2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1310,6 +1310,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
 static int ipip6_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int i;
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
@@ -1322,6 +1323,12 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ipip6_tunnel_stats;
+		ipip6_tunnel_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ipip6_tunnel_stats->syncp);
+	}
+
 	return 0;
 }
 
@@ -1331,6 +1338,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	struct iphdr *iph = &tunnel->parms.iph;
 	struct net *net = dev_net(dev);
 	struct sit_net *sitn = net_generic(net, sit_net_id);
+	int i;
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
@@ -1344,6 +1352,13 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	dev->tstats = alloc_percpu(struct pcpu_tstats);
 	if (!dev->tstats)
 		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *ipip6_fb_stats;
+		ipip6_fb_stats = per_cpu_ptr(dev->tstats, i);
+		u64_stats_init(&ipip6_fb_stats->syncp);
+	}
+
 	dev_hold(dev);
 	rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
 	return 0;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a3df9bddc4f7..3825725907f6 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -842,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	       struct ip_vs_dest **dest_p)
 {
 	struct ip_vs_dest *dest;
-	unsigned int atype;
+	unsigned int atype, i;
 
 	EnterFunction(2);
 
@@ -869,6 +869,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	if (!dest->stats.cpustats)
 		goto err_alloc;
 
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *ip_vs_dest_stats;
+		ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
+		u64_stats_init(&ip_vs_dest_stats->syncp);
+	}
+
 	dest->af = svc->af;
 	dest->protocol = svc->protocol;
 	dest->vaddr = svc->addr;
@@ -1134,7 +1140,7 @@ static int
 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 		  struct ip_vs_service **svc_p)
 {
-	int ret = 0;
+	int ret = 0, i;
 	struct ip_vs_scheduler *sched = NULL;
 	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
@@ -1184,6 +1190,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 		goto out_err;
 	}
 
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *ip_vs_stats;
+		ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+		u64_stats_init(&ip_vs_stats->syncp);
+	}
+
+
 	/* I'm the first user of the service */
 	atomic_set(&svc->refcnt, 0);
 
@@ -3780,7 +3793,7 @@ static struct notifier_block ip_vs_dst_notifier = {
 
 int __net_init ip_vs_control_net_init(struct net *net)
 {
-	int idx;
+	int i, idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	/* Initialize rs_table */
@@ -3799,6 +3812,12 @@ int __net_init ip_vs_control_net_init(struct net *net)
 	if (!ipvs->tot_stats.cpustats)
 		return -ENOMEM;
 
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *ipvs_tot_stats;
+		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+		u64_stats_init(&ipvs_tot_stats->syncp);
+	}
+
 	spin_lock_init(&ipvs->tot_stats.lock);
 
 	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2aa13bd7f2b2..b92553c02279 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1698,6 +1698,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		goto err_destroy_table;
 	}
 
+	for_each_possible_cpu(i) {
+		struct dp_stats_percpu *dpath_stats;
+		dpath_stats = per_cpu_ptr(dp->stats_percpu, i);
+		u64_stats_init(&dpath_stats->sync);
+	}
+
 	dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
 			GFP_KERNEL);
 	if (!dp->ports) {
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6f65dbe13812..d830a95f03a4 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -118,6 +118,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
 {
 	struct vport *vport;
 	size_t alloc_size;
+	int i;
 
 	alloc_size = sizeof(struct vport);
 	if (priv_size) {
@@ -141,6 +142,13 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	for_each_possible_cpu(i) {
+		struct pcpu_tstats *vport_stats;
+		vport_stats = per_cpu_ptr(vport->percpu_stats, i);
+		u64_stats_init(&vport_stats->syncp);
+	}
+
+
 	spin_lock_init(&vport->stats_lock);
 
 	return vport;
-- 
cgit v1.2.3


From dcd607718385d02ce3741de225927a57f528f93b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2013 18:32:06 -0800
Subject: inet: fix a UFO regression

While testing virtio_net and skb_segment() changes, Hannes reported
that UFO was sending wrong frames.

It appears this was introduced by a recent commit :
8c3a897bfab1 ("inet: restore gso for vxlan")

The old condition to perform IP frag was :

tunnel = !!skb->encapsulation;
...
        if (!tunnel && proto == IPPROTO_UDP) {

So the new one should be :

udpfrag = !skb->encapsulation && proto == IPPROTO_UDP;
...
        if (udpfrag) {

Initialization of udpfrag must be done before call
to ops->callbacks.gso_segment(skb, features), as
skb_udp_tunnel_segment() clears skb->encapsulation

(We want udpfrag to be true for UFO, false for VXLAN)

With help from Alexei Starovoitov

Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09d78d4a3cff..68af9aac91d0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1299,6 +1299,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
+	/* Note : following gso_segment() might change skb->encapsulation */
+	udpfrag = !skb->encapsulation && proto == IPPROTO_UDP;
+
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment))
 		segs = ops->callbacks.gso_segment(skb, features);
@@ -1306,7 +1309,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
-	udpfrag = !!skb->encapsulation && proto == IPPROTO_UDP;
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
-- 
cgit v1.2.3


From 81b9eab5ebbf0d5d54da4fc168cfb02c2adc76b8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 12 Nov 2013 14:39:13 -0800
Subject: core/dev: do not ignore dmac in dev_forward_skb()

commit 06a23fe31ca3
("core/dev: set pkt_type after eth_type_trans() in dev_forward_skb()")
and refactoring 64261f230a91
("dev: move skb_scrub_packet() after eth_type_trans()")

are forcing pkt_type to be PACKET_HOST when skb traverses veth.

which means that ip forwarding will kick in inside netns
even if skb->eth->h_dest != dev->dev_addr

Fix order of eth_type_trans() and skb_scrub_packet() in dev_forward_skb()
and in ip_tunnel_rcv()

Fixes: 06a23fe31ca3 ("core/dev: set pkt_type after eth_type_trans() in dev_forward_skb()")
CC: Isaku Yamahata <yamahatanetdev@gmail.com>
CC: Maciej Zenczykowski <zenczykowski@gmail.com>
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 6 +-----
 net/ipv4/ip_tunnel.c | 4 ++--
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8ffc52e01ece..974143d3e727 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1690,13 +1690,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
-	skb->protocol = eth_type_trans(skb, dev);
 
-	/* eth_type_trans() can set pkt_type.
-	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling
-	 * eth_type_trans().
-	 */
 	skb_scrub_packet(skb, true);
+	skb->protocol = eth_type_trans(skb, dev);
 
 	return netif_rx(skb);
 }
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 63a6d6d6b875..254f11c24aa5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -454,6 +454,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	tstats->rx_bytes += skb->len;
 	u64_stats_update_end(&tstats->syncp);
 
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
 	if (tunnel->dev->type == ARPHRD_ETHER) {
 		skb->protocol = eth_type_trans(skb, tunnel->dev);
 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
@@ -461,8 +463,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		skb->dev = tunnel->dev;
 	}
 
-	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
-
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
-- 
cgit v1.2.3


From 98e09386c0ef4dfd48af7ba60ff908f0d525cdee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Nov 2013 06:32:54 -0800
Subject: tcp: tsq: restore minimal amount of queueing

After commit c9eeec26e32e ("tcp: TSQ can use a dynamic limit"), several
users reported throughput regressions, notably on mvneta and wifi
adapters.

802.11 AMPDU requires a fair amount of queueing to be effective.

This patch partially reverts the change done in tcp_write_xmit()
so that the minimal amount is sysctl_tcp_limit_output_bytes.

It also remove the use of this sysctl while building skb stored
in write queue, as TSO autosizing does the right thing anyway.

Users with well behaving NICS and correct qdisc (like sch_fq),
can then lower the default sysctl_tcp_limit_output_bytes value from
128KB to 8KB.

This new usage of sysctl_tcp_limit_output_bytes permits each driver
authors to check how their driver performs when/if the value is set
to a minimum of 4KB.

Normally, line rate for a single TCP flow should be possible,
but some drivers rely on timers to perform TX completion and
too long TX completion delays prevent reaching full throughput.

Fixes: c9eeec26e32e ("tcp: TSQ can use a dynamic limit")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Sujith Manoharan <sujith@msujith.org>
Reported-by: Arnaud Ebalard <arno@natisbad.org>
Tested-by: Sujith Manoharan <sujith@msujith.org>
Cc: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 3 ---
 net/ipv4/tcp.c                         | 6 ------
 net/ipv4/tcp_output.c                  | 6 +++++-
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8b8a05787641..3c12d9a7ed00 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -577,9 +577,6 @@ tcp_limit_output_bytes - INTEGER
 	typical pfifo_fast qdiscs.
 	tcp_limit_output_bytes limits the number of bytes on qdisc
 	or device to reduce artificial RTT/cwnd and reduce bufferbloat.
-	Note: For GSO/TSO enabled flows, we try to have at least two
-	packets in flight. Reducing tcp_limit_output_bytes might also
-	reduce the size of individual GSO packet (64KB being the max)
 	Default: 131072
 
 tcp_challenge_ack_limit - INTEGER
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e8529d3c8c9..3dc0c6cf02a8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -808,12 +808,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 		xmit_size_goal = min_t(u32, gso_size,
 				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have at least two segments in flight
-		 * (one in NIC TX ring, another in Qdisc)
-		 */
-		xmit_size_goal = min_t(u32, xmit_size_goal,
-				       sysctl_tcp_limit_output_bytes >> 1);
-
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 672854664ff5..c5231d9b06d7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1875,8 +1875,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 *  - better RTT estimation and ACK scheduling
 		 *  - faster recovery
 		 *  - high rates
+		 * Alas, some drivers / subsystems require a fair amount
+		 * of queued bytes to ensure line rate.
+		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
+			      sk->sk_pacing_rate >> 10);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
-- 
cgit v1.2.3


From dccf76ca6b626c0c4a4e09bb221adee3270ab0ef Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Nov 2013 15:00:46 -0800
Subject: net-tcp: fix panic in tcp_fastopen_cache_set()

We had some reports of crashes using TCP fastopen, and Dave Jones
gave a nice stack trace pointing to the error.

Issue is that tcp_get_metrics() should not be called with a NULL dst

Fixes: 1fe4c481ba637 ("net-tcp: Fast Open client - cookie cache")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dave Jones <davej@redhat.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Tested-by: Dave Jones <davej@fedoraproject.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2ab09cbae74d..d3ee2e0c28b6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -663,10 +663,13 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
 {
+	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_metrics_block *tm;
 
+	if (!dst)
+		return;
 	rcu_read_lock();
-	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+	tm = tcp_get_metrics(sk, dst, true);
 	if (tm) {
 		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
 
-- 
cgit v1.2.3


From 4534de8305b3f1460a527a0cda0e3dc2224c6f0c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 14 Nov 2013 17:14:46 +0100
Subject: genetlink: make all genl_ops users const

Now that genl_ops are no longer modified in place when
registering, they can be made const. This patch was done
mostly with spatch:

@@
identifier ops;
@@
+const
 struct genl_ops ops[] = {
 ...
 };

(except the struct thing in net/openvswitch/datapath.c)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c               |  2 +-
 drivers/net/wireless/mac80211_hwsim.c |  2 +-
 kernel/taskstats.c                    |  2 +-
 net/core/drop_monitor.c               |  2 +-
 net/hsr/hsr_netlink.c                 |  2 +-
 net/ieee802154/netlink.c              |  2 +-
 net/ipv4/tcp_metrics.c                |  2 +-
 net/irda/irnetlink.c                  |  2 +-
 net/l2tp/l2tp_netlink.c               |  2 +-
 net/netfilter/ipvs/ip_vs_ctl.c        |  2 +-
 net/netlabel/netlabel_cipso_v4.c      |  2 +-
 net/netlabel/netlabel_mgmt.c          |  2 +-
 net/netlabel/netlabel_unlabeled.c     |  2 +-
 net/nfc/netlink.c                     |  2 +-
 net/openvswitch/datapath.c            | 10 +++++-----
 net/wimax/stack.c                     |  2 +-
 net/wireless/nl80211.c                |  2 +-
 17 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 50e43e64d51d..6390254beb7d 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2644,7 +2644,7 @@ static int team_nl_cmd_port_list_get(struct sk_buff *skb,
 	return err;
 }
 
-static struct genl_ops team_nl_ops[] = {
+static const struct genl_ops team_nl_ops[] = {
 	{
 		.cmd = TEAM_CMD_NOOP,
 		.doit = team_nl_cmd_noop,
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index de0df86704e7..cfc3fda79a2d 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2097,7 +2097,7 @@ out:
 }
 
 /* Generic Netlink operations array */
-static struct genl_ops hwsim_ops[] = {
+static const struct genl_ops hwsim_ops[] = {
 	{
 		.cmd = HWSIM_CMD_REGISTER,
 		.policy = hwsim_genl_policy,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 609e77f68687..76595cd9d211 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -673,7 +673,7 @@ err:
 	nlmsg_free(rep_skb);
 }
 
-static struct genl_ops taskstats_ops[] = {
+static const struct genl_ops taskstats_ops[] = {
 	{
 		.cmd		= TASKSTATS_CMD_GET,
 		.doit		= taskstats_user_cmd,
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 5e78d44333b9..f9fe2f22d20b 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -333,7 +333,7 @@ out:
 	return NOTIFY_DONE;
 }
 
-static struct genl_ops dropmon_ops[] = {
+static const struct genl_ops dropmon_ops[] = {
 	{
 		.cmd = NET_DM_CMD_CONFIG,
 		.doit = net_dm_cmd_config,
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 8f52a9fb7e85..79d72ca309ce 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -389,7 +389,7 @@ fail:
 }
 
 
-static struct genl_ops hsr_ops[] = {
+static const struct genl_ops hsr_ops[] = {
 	{
 		.cmd = HSR_C_GET_NODE_STATUS,
 		.flags = 0,
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index eb9faef6782a..3ffcdbb56aab 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -109,7 +109,7 @@ out:
 	return -ENOBUFS;
 }
 
-static struct genl_ops ieee8021154_ops[] = {
+static const struct genl_ops ieee8021154_ops[] = {
 	/* see nl-phy.c */
 	IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
 			ieee802154_dump_phy),
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d3ee2e0c28b6..8c121b523eee 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -991,7 +991,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static struct genl_ops tcp_metrics_nl_ops[] = {
+static const struct genl_ops tcp_metrics_nl_ops[] = {
 	{
 		.cmd = TCP_METRICS_CMD_GET,
 		.doit = tcp_metrics_nl_cmd_get,
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index c32971269280..bf5d7d476dae 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -131,7 +131,7 @@ static const struct nla_policy irda_nl_policy[IRDA_NL_ATTR_MAX + 1] = {
 	[IRDA_NL_ATTR_MODE] = { .type = NLA_U32 },
 };
 
-static struct genl_ops irda_nl_ops[] = {
+static const struct genl_ops irda_nl_ops[] = {
 	{
 		.cmd = IRDA_NL_CMD_SET_MODE,
 		.doit = irda_nl_set_mode,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index be446d517bc9..57db66e24f1f 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -793,7 +793,7 @@ static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
 	},
 };
 
-static struct genl_ops l2tp_nl_ops[] = {
+static const struct genl_ops l2tp_nl_ops[] = {
 	{
 		.cmd = L2TP_CMD_NOOP,
 		.doit = l2tp_nl_cmd_noop,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 62786a495cea..fc8a04ed8854 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3567,7 +3567,7 @@ out:
 }
 
 
-static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+static const struct genl_ops ip_vs_genl_ops[] __read_mostly = {
 	{
 		.cmd	= IPVS_CMD_NEW_SERVICE,
 		.flags	= GENL_ADMIN_PERM,
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index a1100640495d..706691739b99 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -737,7 +737,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
  * NetLabel Generic NETLINK Command Definitions
  */
 
-static struct genl_ops netlbl_cipsov4_ops[] = {
+static const struct genl_ops netlbl_cipsov4_ops[] = {
 	{
 	.cmd = NLBL_CIPSOV4_C_ADD,
 	.flags = GENL_ADMIN_PERM,
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index dd1c37d7acbc..7de6f660b80a 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -705,7 +705,7 @@ version_failure:
  * NetLabel Generic NETLINK Command Definitions
  */
 
-static struct genl_ops netlbl_mgmt_genl_ops[] = {
+static const struct genl_ops netlbl_mgmt_genl_ops[] = {
 	{
 	.cmd = NLBL_MGMT_C_ADD,
 	.flags = GENL_ADMIN_PERM,
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 8f0897407a2c..76ee9252daa7 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1323,7 +1323,7 @@ unlabel_staticlistdef_return:
  * NetLabel Generic NETLINK Command Definitions
  */
 
-static struct genl_ops netlbl_unlabel_genl_ops[] = {
+static const struct genl_ops netlbl_unlabel_genl_ops[] = {
 	{
 	.cmd = NLBL_UNLABEL_C_STATICADD,
 	.flags = GENL_ADMIN_PERM,
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 84b7e3ea7b7a..f5585611c098 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1364,7 +1364,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info)
 	return dev->ops->se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx);
 }
 
-static struct genl_ops nfc_genl_ops[] = {
+static const struct genl_ops nfc_genl_ops[] = {
 	{
 		.cmd = NFC_CMD_GET_DEVICE,
 		.doit = nfc_genl_get_device,
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 1408adc2a2a7..91e1c927a465 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -557,7 +557,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
 };
 
-static struct genl_ops dp_packet_genl_ops[] = {
+static const struct genl_ops dp_packet_genl_ops[] = {
 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = packet_policy,
@@ -1034,7 +1034,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static struct genl_ops dp_flow_genl_ops[] = {
+static const struct genl_ops dp_flow_genl_ops[] = {
 	{ .cmd = OVS_FLOW_CMD_NEW,
 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
@@ -1392,7 +1392,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static struct genl_ops dp_datapath_genl_ops[] = {
+static const struct genl_ops dp_datapath_genl_ops[] = {
 	{ .cmd = OVS_DP_CMD_NEW,
 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
@@ -1753,7 +1753,7 @@ out:
 	return skb->len;
 }
 
-static struct genl_ops dp_vport_genl_ops[] = {
+static const struct genl_ops dp_vport_genl_ops[] = {
 	{ .cmd = OVS_VPORT_CMD_NEW,
 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
@@ -1779,7 +1779,7 @@ static struct genl_ops dp_vport_genl_ops[] = {
 
 struct genl_family_and_ops {
 	struct genl_family *family;
-	struct genl_ops *ops;
+	const struct genl_ops *ops;
 	int n_ops;
 	struct genl_multicast_group *group;
 };
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 4b7f15a50087..47170c9495f1 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -415,7 +415,7 @@ static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = {
 	},
 };
 
-static struct genl_ops wimax_gnl_ops[] = {
+static const struct genl_ops wimax_gnl_ops[] = {
 	{
 		.cmd = WIMAX_GNL_OP_MSG_FROM_USER,
 		.flags = GENL_ADMIN_PERM,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3fef55958e98..58c43c8e149f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8937,7 +8937,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		rtnl_unlock();
 }
 
-static struct genl_ops nl80211_ops[] = {
+static const struct genl_ops nl80211_ops[] = {
 	{
 		.cmd = NL80211_CMD_GET_WIPHY,
 		.doit = nl80211_get_wiphy,
-- 
cgit v1.2.3


From c9e9042994d37cbc1ee538c500e9da1bb9d1bcdf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Nov 2013 13:37:54 -0800
Subject: ipv4: fix possible seqlock deadlock

ip4_datagram_connect() being called from process context,
it should use IP_INC_STATS() instead of IP_INC_STATS_BH()
otherwise we can deadlock on 32bit arches, or get corruptions of
SNMP counters.

Fixes: 584bdf8cbdf6 ("[IPV4]: Fix "ipOutNoRoutes" counter error for TCP and UDP")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/datagram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b28e863fe0a7..19e36376d2a0 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -57,7 +57,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
-			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 652586df95e5d76b37d07a11839126dcfede1621 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 14 Nov 2013 14:31:57 -0800
Subject: seq_file: remove "%n" usage from seq_file users

All seq_printf() users are using "%n" for calculating padding size,
convert them to use seq_setwidth() / seq_pad() pair.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Joe Perches <joe@perches.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/consoles.c   | 10 ++++------
 fs/proc/nommu.c      | 12 +++++-------
 fs/proc/task_mmu.c   | 20 ++++++--------------
 fs/proc/task_nommu.c | 19 ++++++-------------
 net/ipv4/fib_trie.c  | 13 +++++++------
 net/ipv4/ping.c      | 15 +++++++--------
 net/ipv4/tcp_ipv4.c  | 33 +++++++++++++++------------------
 net/ipv4/udp.c       | 15 +++++++--------
 net/phonet/socket.c  | 24 +++++++++++-------------
 net/sctp/objcnt.c    |  9 +++++----
 10 files changed, 73 insertions(+), 97 deletions(-)

(limited to 'net/ipv4')

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index b701eaa482bf..51942d5abcec 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v)
 	char flags[ARRAY_SIZE(con_flags) + 1];
 	struct console *con = v;
 	unsigned int a;
-	int len;
 	dev_t dev = 0;
 
 	if (con->device) {
@@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v)
 			con_flags[a].name : ' ';
 	flags[a] = 0;
 
-	seq_printf(m, "%s%d%n", con->name, con->index, &len);
-	len = 21 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+	seq_setwidth(m, 21 - 1);
+	seq_printf(m, "%s%d", con->name, con->index);
+	seq_pad(m, ' ');
+	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
 			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
 			flags);
 	if (dev)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index ccfd99bd1c5a..5f9bc8a746c9 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
-	int flags, len;
+	int flags;
 
 	flags = region->vm_flags;
 	file = region->vm_file;
@@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 		ino = inode->i_ino;
 	}
 
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 		   region->vm_start,
 		   region->vm_end,
 		   flags & VM_READ ? 'r' : '-',
@@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
 		   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
-		   MAJOR(dev), MINOR(dev), ino, &len);
+		   MAJOR(dev), MINOR(dev), ino);
 
 	if (file) {
-		len = 25 + sizeof(void *) * 6 - len;
-		if (len < 1)
-			len = 1;
-		seq_printf(m, "%*c", len, ' ');
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "");
 	}
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 42b5cf5d0326..fb52b548080d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -84,14 +84,6 @@ unsigned long task_statm(struct mm_struct *mm,
 	return mm->total_vm;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-	len = 25 + sizeof(void*) * 6 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c", len, ' ');
-}
-
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -269,7 +261,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	unsigned long long pgoff = 0;
 	unsigned long start, end;
 	dev_t dev = 0;
-	int len;
 	const char *name = NULL;
 
 	if (file) {
@@ -287,7 +278,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	if (stack_guard_page_end(vma, end))
 		end -= PAGE_SIZE;
 
-	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 			start,
 			end,
 			flags & VM_READ ? 'r' : '-',
@@ -295,14 +287,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
 			pgoff,
-			MAJOR(dev), MINOR(dev), ino, &len);
+			MAJOR(dev), MINOR(dev), ino);
 
 	/*
 	 * Print the dentry name for named mappings, and a
 	 * special [heap] marker for the heap:
 	 */
 	if (file) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "\n");
 		goto done;
 	}
@@ -334,7 +326,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				name = "[stack]";
 			} else {
 				/* Thread stack in /proc/PID/maps */
-				pad_len_spaces(m, len);
+				seq_pad(m, ' ');
 				seq_printf(m, "[stack:%d]", tid);
 			}
 		}
@@ -342,7 +334,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 done:
 	if (name) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_puts(m, name);
 	}
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 56123a6f462e..678455d2d683 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm,
 	return size;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-	len = 25 + sizeof(void*) * 6 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c", len, ' ');
-}
-
 /*
  * display a single VMA to a sequenced file
  */
@@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
-	int flags, len;
+	int flags;
 	unsigned long long pgoff = 0;
 
 	flags = vma->vm_flags;
@@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	}
 
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 		   vma->vm_start,
 		   vma->vm_end,
 		   flags & VM_READ ? 'r' : '-',
@@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
 		   pgoff,
-		   MAJOR(dev), MINOR(dev), ino, &len);
+		   MAJOR(dev), MINOR(dev), ino);
 
 	if (file) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "");
 	} else if (mm) {
 		pid_t tid = vm_is_stack(priv->task, vma, is_pid);
 
 		if (tid != 0) {
-			pad_len_spaces(m, len);
+			seq_pad(m, ' ');
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
 			 * the main process stack.
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec9a9ef4ce50..5afeb5aa4c7c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2523,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
 			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
-			int len;
 
 			if (fa->fa_type == RTN_BROADCAST
 			    || fa->fa_type == RTN_MULTICAST)
 				continue;
 
+			seq_setwidth(seq, 127);
+
 			if (fi)
 				seq_printf(seq,
 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
-					 "%d\t%08X\t%d\t%u\t%u%n",
+					 "%d\t%08X\t%d\t%u\t%u",
 					 fi->fib_dev ? fi->fib_dev->name : "*",
 					 prefix,
 					 fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2541,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 					 (fi->fib_advmss ?
 					  fi->fib_advmss + 40 : 0),
 					 fi->fib_window,
-					 fi->fib_rtt >> 3, &len);
+					 fi->fib_rtt >> 3);
 			else
 				seq_printf(seq,
 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
-					 "%d\t%08X\t%d\t%u\t%u%n",
+					 "%d\t%08X\t%d\t%u\t%u",
 					 prefix, 0, flags, 0, 0, 0,
-					 mask, 0, 0, 0, &len);
+					 mask, 0, 0, 0);
 
-			seq_printf(seq, "%*s\n", 127 - len, "");
+			seq_pad(seq, '\n');
 		}
 	}
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9afbdb19f4a2..cbc85f660d54 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1076,7 +1076,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)
 EXPORT_SYMBOL_GPL(ping_seq_stop);
 
 static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
-		int bucket, int *len)
+		int bucket)
 {
 	struct inet_sock *inet = inet_sk(sp);
 	__be32 dest = inet->inet_daddr;
@@ -1085,7 +1085,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
@@ -1093,23 +1093,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
 		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		atomic_read(&sp->sk_drops), len);
+		atomic_read(&sp->sk_drops));
 }
 
 static int ping_v4_seq_show(struct seq_file *seq, void *v)
 {
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%-127s\n",
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode ref pointer drops");
 	else {
 		struct ping_iter_state *state = seq->private;
-		int len;
 
-		ping_v4_format_sock(v, seq, state->bucket, &len);
-		seq_printf(seq, "%*s\n", 127 - len, "");
+		ping_v4_format_sock(v, seq, state->bucket);
 	}
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 14bba8a1c5a7..59a6f8b90cd9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2541,13 +2541,13 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
-			 struct seq_file *f, int i, kuid_t uid, int *len)
+			 struct seq_file *f, int i, kuid_t uid)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	long delta = req->expires - jiffies;
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
 		i,
 		ireq->ir_loc_addr,
 		ntohs(inet_sk(sk)->inet_sport),
@@ -2562,11 +2562,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 		0,  /* non standard timer */
 		0, /* open_requests have no inode */
 		atomic_read(&sk->sk_refcnt),
-		req,
-		len);
+		req);
 }
 
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 {
 	int timer_active;
 	unsigned long timer_expires;
@@ -2605,7 +2604,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
-			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
+			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
 		i, src, srcp, dest, destp, sk->sk_state,
 		tp->write_seq - tp->snd_una,
 		rx_queue,
@@ -2622,12 +2621,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		tp->snd_cwnd,
 		sk->sk_state == TCP_LISTEN ?
 		    (fastopenq ? fastopenq->max_qlen : 0) :
-		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
-		len);
+		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
 }
 
 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
-			       struct seq_file *f, int i, int *len)
+			       struct seq_file *f, int i)
 {
 	__be32 dest, src;
 	__u16 destp, srcp;
@@ -2639,10 +2637,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 	srcp  = ntohs(tw->tw_sport);
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
-		atomic_read(&tw->tw_refcnt), tw, len);
+		atomic_read(&tw->tw_refcnt), tw);
 }
 
 #define TMPSZ 150
@@ -2651,11 +2649,10 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st;
 	struct sock *sk = v;
-	int len;
 
+	seq_setwidth(seq, TMPSZ - 1);
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-*s\n", TMPSZ - 1,
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode");
 		goto out;
@@ -2666,16 +2663,16 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_LISTENING:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (sk->sk_state == TCP_TIME_WAIT)
-			get_timewait4_sock(v, seq, st->num, &len);
+			get_timewait4_sock(v, seq, st->num);
 		else
-			get_tcp4_sock(v, seq, st->num, &len);
+			get_tcp4_sock(v, seq, st->num);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
-		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
+		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
 		break;
 	}
-	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
 out:
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89909dd730dd..de86e5bc4462 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2331,7 +2331,7 @@ EXPORT_SYMBOL(udp_proc_unregister);
 
 /* ------------------------------------------------------------------------ */
 static void udp4_format_sock(struct sock *sp, struct seq_file *f,
-		int bucket, int *len)
+		int bucket)
 {
 	struct inet_sock *inet = inet_sk(sp);
 	__be32 dest = inet->inet_daddr;
@@ -2340,7 +2340,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp	  = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
@@ -2348,23 +2348,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
 		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		atomic_read(&sp->sk_drops), len);
+		atomic_read(&sp->sk_drops));
 }
 
 int udp4_seq_show(struct seq_file *seq, void *v)
 {
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%-127s\n",
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode ref pointer drops");
 	else {
 		struct udp_iter_state *state = seq->private;
-		int len;
 
-		udp4_format_sock(v, seq, state->bucket, &len);
-		seq_printf(seq, "%*s\n", 127 - len, "");
+		udp4_format_sock(v, seq, state->bucket);
 	}
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 77e38f733496..008214a3d5eb 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -595,26 +595,25 @@ static void pn_sock_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_sock_seq_show(struct seq_file *seq, void *v)
 {
-	int len;
-
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%s%n", "pt  loc  rem rs st tx_queue rx_queue "
-			"  uid inode ref pointer drops", &len);
+		seq_puts(seq, "pt  loc  rem rs st tx_queue rx_queue "
+			"  uid inode ref pointer drops");
 	else {
 		struct sock *sk = v;
 		struct pn_sock *pn = pn_sk(sk);
 
 		seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
-			"%d %pK %d%n",
+			"%d %pK %d",
 			sk->sk_protocol, pn->sobject, pn->dobject,
 			pn->resource, sk->sk_state,
 			sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
 			from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
 			sock_i_ino(sk),
 			atomic_read(&sk->sk_refcnt), sk,
-			atomic_read(&sk->sk_drops), &len);
+			atomic_read(&sk->sk_drops));
 	}
-	seq_printf(seq, "%*s\n", 127 - len, "");
+	seq_pad(seq, '\n');
 	return 0;
 }
 
@@ -785,20 +784,19 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_res_seq_show(struct seq_file *seq, void *v)
 {
-	int len;
-
+	seq_setwidth(seq, 63);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%s%n", "rs   uid inode", &len);
+		seq_puts(seq, "rs   uid inode");
 	else {
 		struct sock **psk = v;
 		struct sock *sk = *psk;
 
-		seq_printf(seq, "%02X %5u %lu%n",
+		seq_printf(seq, "%02X %5u %lu",
 			   (int) (psk - pnres.sk),
 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
-			   sock_i_ino(sk), &len);
+			   sock_i_ino(sk));
 	}
-	seq_printf(seq, "%*s\n", 63 - len, "");
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 5ea573b37648..647396baa56f 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -79,12 +79,13 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
  */
 static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
 {
-	int i, len;
+	int i;
 
 	i = (int)*(loff_t *)v;
-	seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label,
-				atomic_read(sctp_dbg_objcnt[i].counter), &len);
-	seq_printf(seq, "%*s\n", 127 - len, "");
+	seq_setwidth(seq, 127);
+	seq_printf(seq, "%s: %d", sctp_dbg_objcnt[i].label,
+				atomic_read(sctp_dbg_objcnt[i].counter));
+	seq_pad(seq, '\n');
 	return 0;
 }
 
-- 
cgit v1.2.3


From a6441b7a39f18acb68c83cd738f1310881aa8a0b Mon Sep 17 00:00:00 2001
From: Martin Topholm <mph@one.com>
Date: Thu, 14 Nov 2013 15:35:30 +0100
Subject: netfilter: synproxy: send mss option to backend

When the synproxy_parse_options is called on the client ack the mss
option will not be present. Consequently mss wont be included in the
backend syn packet, which falls back to 536 bytes mss.

Therefore XT_SYNPROXY_OPT_MSS is explicitly flagged when recovering mss
value from cookie.

Signed-off-by: Martin Topholm <mph@one.com>
Reviewed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_SYNPROXY.c  | 1 +
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 01cffeaa0085..f13bd91d9a56 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -244,6 +244,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
 
 	this_cpu_inc(snet->stats->cookie_valid);
 	opts->mss = mss;
+	opts->options |= XT_SYNPROXY_OPT_MSS;
 
 	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
 		synproxy_check_timestamp_cookie(opts);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index bf9f612c1bc2..f78f41aca8e9 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -259,6 +259,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
 
 	this_cpu_inc(snet->stats->cookie_valid);
 	opts->mss = mss;
+	opts->options |= XT_SYNPROXY_OPT_MSS;
 
 	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
 		synproxy_check_timestamp_cookie(opts);
-- 
cgit v1.2.3


From bceaa90240b6019ed73b49965eac7d167610be69 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Nov 2013 04:20:45 +0100
Subject: inet: prevent leakage of uninitialized memory to user in recv
 syscalls

Only update *addr_len when we actually fill in sockaddr, otherwise we
can return uninitialized memory from the stack to the caller in the
recvfrom, recvmmsg and recvmsg syscalls. Drop the the (addr_len == NULL)
checks because we only get called with a valid addr_len pointer either
from sock_common_recvmsg or inet_recvmsg.

If a blocking read waits on a socket which is concurrently shut down we
now return zero and set msg_msgnamelen to 0.

Reported-by: mpb <mpb.mail@gmail.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/dgram.c |  3 +--
 net/ipv4/ping.c        | 19 +++++++------------
 net/ipv4/raw.c         |  4 +---
 net/ipv4/udp.c         |  7 +------
 net/ipv6/raw.c         |  4 +---
 net/ipv6/udp.c         |  5 +----
 net/l2tp/l2tp_ip.c     |  4 +---
 net/phonet/datagram.c  |  9 ++++-----
 8 files changed, 17 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index 581a59504bd5..1865fdf5a5a5 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -315,9 +315,8 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (saddr) {
 		saddr->family = AF_IEEE802154;
 		saddr->addr = mac_cb(skb)->sa;
-	}
-	if (addr_len)
 		*addr_len = sizeof(*saddr);
+	}
 
 	if (flags & MSG_TRUNC)
 		copied = skb->len;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9afbdb19f4a2..aacefa0caa36 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -830,8 +830,6 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 {
 	struct inet_sock *isk = inet_sk(sk);
 	int family = sk->sk_family;
-	struct sockaddr_in *sin;
-	struct sockaddr_in6 *sin6;
 	struct sk_buff *skb;
 	int copied, err;
 
@@ -841,13 +839,6 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (flags & MSG_OOB)
 		goto out;
 
-	if (addr_len) {
-		if (family == AF_INET)
-			*addr_len = sizeof(*sin);
-		else if (family == AF_INET6 && addr_len)
-			*addr_len = sizeof(*sin6);
-	}
-
 	if (flags & MSG_ERRQUEUE) {
 		if (family == AF_INET) {
 			return ip_recv_error(sk, msg, len);
@@ -877,11 +868,13 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	/* Copy the address and add cmsg data. */
 	if (family == AF_INET) {
-		sin = (struct sockaddr_in *) msg->msg_name;
+		struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+
 		sin->sin_family = AF_INET;
 		sin->sin_port = 0 /* skb->h.uh->source */;
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
 
 		if (isk->cmsg_flags)
 			ip_cmsg_recv(msg, skb);
@@ -890,17 +883,19 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else if (family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct ipv6hdr *ip6 = ipv6_hdr(skb);
-		sin6 = (struct sockaddr_in6 *) msg->msg_name;
+		struct sockaddr_in6 *sin6 =
+			(struct sockaddr_in6 *)msg->msg_name;
+
 		sin6->sin6_family = AF_INET6;
 		sin6->sin6_port = 0;
 		sin6->sin6_addr = ip6->saddr;
-
 		sin6->sin6_flowinfo = 0;
 		if (np->sndflow)
 			sin6->sin6_flowinfo = ip6_flowinfo(ip6);
 
 		sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
 							  IP6CB(skb)->iif);
+		*addr_len = sizeof(*sin6);
 
 		if (inet6_sk(sk)->rxopt.all)
 			pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 41e1d2845c8f..5cb8ddb505ee 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -696,9 +696,6 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (flags & MSG_OOB)
 		goto out;
 
-	if (addr_len)
-		*addr_len = sizeof(*sin);
-
 	if (flags & MSG_ERRQUEUE) {
 		err = ip_recv_error(sk, msg, len);
 		goto out;
@@ -726,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		sin->sin_port = 0;
 		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
 	}
 	if (inet->cmsg_flags)
 		ip_cmsg_recv(msg, skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89909dd730dd..998431cd471a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1235,12 +1235,6 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int is_udplite = IS_UDPLITE(sk);
 	bool slow;
 
-	/*
-	 *	Check any passed addresses
-	 */
-	if (addr_len)
-		*addr_len = sizeof(*sin);
-
 	if (flags & MSG_ERRQUEUE)
 		return ip_recv_error(sk, msg, len);
 
@@ -1302,6 +1296,7 @@ try_again:
 		sin->sin_port = udp_hdr(skb)->source;
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
 	}
 	if (inet->cmsg_flags)
 		ip_cmsg_recv(msg, skb);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 3c00842b0079..e24ff1df0401 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -465,9 +465,6 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (flags & MSG_OOB)
 		return -EOPNOTSUPP;
 
-	if (addr_len)
-		*addr_len=sizeof(*sin6);
-
 	if (flags & MSG_ERRQUEUE)
 		return ipv6_recv_error(sk, msg, len);
 
@@ -506,6 +503,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		sin6->sin6_flowinfo = 0;
 		sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
 							  IP6CB(skb)->iif);
+		*addr_len = sizeof(*sin6);
 	}
 
 	sock_recv_ts_and_drops(msg, sk, skb);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f3893e897f72..81eb8cf8389b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -392,9 +392,6 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	int is_udp4;
 	bool slow;
 
-	if (addr_len)
-		*addr_len = sizeof(struct sockaddr_in6);
-
 	if (flags & MSG_ERRQUEUE)
 		return ipv6_recv_error(sk, msg, len);
 
@@ -480,7 +477,7 @@ try_again:
 				ipv6_iface_scope_id(&sin6->sin6_addr,
 						    IP6CB(skb)->iif);
 		}
-
+		*addr_len = sizeof(*sin6);
 	}
 	if (is_udp4) {
 		if (inet->cmsg_flags)
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 571db8dd2292..da1a1cee1a08 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -518,9 +518,6 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	if (flags & MSG_OOB)
 		goto out;
 
-	if (addr_len)
-		*addr_len = sizeof(*sin);
-
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
 		goto out;
@@ -543,6 +540,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		sin->sin_port = 0;
 		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
 	}
 	if (inet->cmsg_flags)
 		ip_cmsg_recv(msg, skb);
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 12c30f3e643e..38946b26e471 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -139,9 +139,6 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 			MSG_CMSG_COMPAT))
 		goto out_nofree;
 
-	if (addr_len)
-		*addr_len = sizeof(sa);
-
 	skb = skb_recv_datagram(sk, flags, noblock, &rval);
 	if (skb == NULL)
 		goto out_nofree;
@@ -162,8 +159,10 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 
 	rval = (flags & MSG_TRUNC) ? skb->len : copylen;
 
-	if (msg->msg_name != NULL)
-		memcpy(msg->msg_name, &sa, sizeof(struct sockaddr_pn));
+	if (msg->msg_name != NULL) {
+		memcpy(msg->msg_name, &sa, sizeof(sa));
+		*addr_len = sizeof(sa);
+	}
 
 out:
 	skb_free_datagram(sk, skb);
-- 
cgit v1.2.3


From cf970c002d270c36202bd5b9c2804d3097a52da0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 18 Nov 2013 07:07:45 +0100
Subject: ping: prevent NULL pointer dereference on write to msg_name

A plain read() on a socket does set msg->msg_name to NULL. So check for
NULL pointer first.

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index aacefa0caa36..91bfe0437e7a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -870,11 +870,13 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (family == AF_INET) {
 		struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
 
-		sin->sin_family = AF_INET;
-		sin->sin_port = 0 /* skb->h.uh->source */;
-		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
-		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-		*addr_len = sizeof(*sin);
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_port = 0 /* skb->h.uh->source */;
+			sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			*addr_len = sizeof(*sin);
+		}
 
 		if (isk->cmsg_flags)
 			ip_cmsg_recv(msg, skb);
@@ -886,16 +888,18 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		struct sockaddr_in6 *sin6 =
 			(struct sockaddr_in6 *)msg->msg_name;
 
-		sin6->sin6_family = AF_INET6;
-		sin6->sin6_port = 0;
-		sin6->sin6_addr = ip6->saddr;
-		sin6->sin6_flowinfo = 0;
-		if (np->sndflow)
-			sin6->sin6_flowinfo = ip6_flowinfo(ip6);
-
-		sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
-							  IP6CB(skb)->iif);
-		*addr_len = sizeof(*sin6);
+		if (sin6) {
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = 0;
+			sin6->sin6_addr = ip6->saddr;
+			sin6->sin6_flowinfo = 0;
+			if (np->sndflow)
+				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+			sin6->sin6_scope_id =
+				ipv6_iface_scope_id(&sin6->sin6_addr,
+						    IP6CB(skb)->iif);
+			*addr_len = sizeof(*sin6);
+		}
 
 		if (inet6_sk(sk)->rxopt.all)
 			pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb);
-- 
cgit v1.2.3


From 236c9f84868534c718b6889aa624de64763281f9 Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Tue, 19 Nov 2013 16:53:28 +0800
Subject: xfrm: Release dst if this dst is improper for vti tunnel

After searching rt by the vti tunnel dst/src parameter,
if this rt has neither attached to any transformation
nor the transformation is not tunnel oriented, this rt
should be released back to ip layer.

otherwise causing dst memory leakage.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d9c845d288a..52b802a0cd8c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -126,6 +126,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (!rt->dst.xfrm ||
 	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
 		dev->stats.tx_carrier_errors++;
+		ip_rt_put(rt);
 		goto tx_error_icmp;
 	}
 	tdev = rt->dst.dev;
-- 
cgit v1.2.3


From dbde497966804e63a38fdedc1e3815e77097efc2 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 19 Nov 2013 22:10:06 +0400
Subject: tcp: don't update snd_nxt, when a socket is switched from repair mode

snd_nxt must be updated synchronously with sk_send_head.  Otherwise
tp->packets_out may be updated incorrectly, what may bring a kernel panic.

Here is a kernel panic from my host.
[  103.043194] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048
[  103.044025] IP: [<ffffffff815aaaaf>] tcp_rearm_rto+0xcf/0x150
...
[  146.301158] Call Trace:
[  146.301158]  [<ffffffff815ab7f0>] tcp_ack+0xcc0/0x12c0

Before this panic a tcp socket was restored. This socket had sent and
unsent data in the write queue. Sent data was restored in repair mode,
then the socket was switched from reapair mode and unsent data was
restored. After that the socket was switched back into repair mode.

In that moment we had a socket where write queue looks like this:
snd_una    snd_nxt   write_seq
   |_________|________|
             |
	  sk_send_head

After a second switching from repair mode the state of socket was
changed:

snd_una          snd_nxt, write_seq
   |_________ ________|
             |
	  sk_send_head

This state is inconsistent, because snd_nxt and sk_send_head are not
synchronized.

Bellow you can find a call trace, how packets_out can be incremented
twice for one skb, if snd_nxt and sk_send_head are not synchronized.
In this case packets_out will be always positive, even when
sk_write_queue is empty.

tcp_write_wakeup
	skb = tcp_send_head(sk);
	tcp_fragment
		if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq))
			tcp_adjust_pcount(sk, skb, diff);
	tcp_event_new_data_sent
		tp->packets_out += tcp_skb_pcount(skb);

I think update of snd_nxt isn't required, when a socket is switched from
repair mode.  Because it's initialized in tcp_connect_init. Then when a
write queue is restored, snd_nxt is incremented in tcp_event_new_data_sent,
so it's always is in consistent state.

I have checked, that the bug is not reproduced with this patch and
all tests about restoring tcp connections work fine.

Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5231d9b06d7..7820f3a7dd70 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3097,7 +3097,6 @@ void tcp_send_window_probe(struct sock *sk)
 {
 	if (sk->sk_state == TCP_ESTABLISHED) {
 		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
-		tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
 		tcp_xmit_probe_skb(sk, 0);
 	}
 }
-- 
cgit v1.2.3


From c53ed7423619b4e8108914a9f31b426dd58ad591 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:31 +0100
Subject: genetlink: only pass array to genl_register_family_with_ops()

As suggested by David Miller, make genl_register_family_with_ops()
a macro and pass only the array, evaluating ARRAY_SIZE() in the
macro, this is a little safer.

The openvswitch has some indirection, assing ops/n_ops directly in
that code. This might ultimately just assign the pointers in the
family initializations, saving the struct genl_family_and_ops and
code (once mcast groups are handled differently.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c               |  3 +--
 drivers/net/wireless/mac80211_hwsim.c |  3 +--
 fs/dlm/netlink.c                      | 10 ++++++----
 include/linux/genl_magic_func.h       |  3 +--
 include/net/genetlink.h               |  8 ++++++--
 kernel/taskstats.c                    |  3 +--
 net/core/drop_monitor.c               |  3 +--
 net/hsr/hsr_netlink.c                 |  3 +--
 net/ieee802154/netlink.c              |  3 +--
 net/ipv4/tcp_metrics.c                |  3 +--
 net/irda/irnetlink.c                  |  3 +--
 net/l2tp/l2tp_netlink.c               |  7 +------
 net/netfilter/ipvs/ip_vs_ctl.c        |  2 +-
 net/netlabel/netlabel_cipso_v4.c      |  2 +-
 net/netlabel/netlabel_mgmt.c          |  2 +-
 net/netlabel/netlabel_unlabeled.c     |  2 +-
 net/netlink/genetlink.c               | 14 ++++++++------
 net/nfc/netlink.c                     |  3 +--
 net/openvswitch/datapath.c            |  5 +++--
 net/tipc/netlink.c                    | 11 ++++++-----
 net/wimax/stack.c                     |  4 ++--
 net/wireless/nl80211.c                |  3 +--
 22 files changed, 47 insertions(+), 53 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 6390254beb7d..f55758b0840e 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2699,8 +2699,7 @@ static int team_nl_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops(&team_nl_family, team_nl_ops,
-					    ARRAY_SIZE(team_nl_ops));
+	err = genl_register_family_with_ops(&team_nl_family, team_nl_ops);
 	if (err)
 		return err;
 
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index cfc3fda79a2d..9df7bc91a26f 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2148,8 +2148,7 @@ static int hwsim_init_netlink(void)
 
 	printk(KERN_INFO "mac80211_hwsim: initializing netlink\n");
 
-	rc = genl_register_family_with_ops(&hwsim_genl_family,
-		hwsim_ops, ARRAY_SIZE(hwsim_ops));
+	rc = genl_register_family_with_ops(&hwsim_genl_family, hwsim_ops);
 	if (rc)
 		goto failure;
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 60a327863b11..e7cfbaf8d0e2 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -74,14 +74,16 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static struct genl_ops dlm_nl_ops = {
-	.cmd		= DLM_CMD_HELLO,
-	.doit		= user_cmd,
+static struct genl_ops dlm_nl_ops[] = {
+	{
+		.cmd	= DLM_CMD_HELLO,
+		.doit	= user_cmd,
+	},
 };
 
 int __init dlm_netlink_init(void)
 {
-	return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
+	return genl_register_family_with_ops(&family, dlm_nl_ops);
 }
 
 void dlm_netlink_exit(void)
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 023bc346b877..47086030ab31 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -293,8 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 
 int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
 {
-	int err = genl_register_family_with_ops(&ZZZ_genl_family,
-		ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops));
+	int err = genl_register_family_with_ops(&ZZZ_genl_family, ZZZ_genl_ops);
 	if (err)
 		return err;
 #undef GENL_mc_group
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index e96385d46b48..9bd52a4c5e17 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -152,8 +152,9 @@ static inline int genl_register_family(struct genl_family *family)
  *
  * Return 0 on success or a negative error code.
  */
-static inline int genl_register_family_with_ops(struct genl_family *family,
-	const struct genl_ops *ops, size_t n_ops)
+static inline int _genl_register_family_with_ops(struct genl_family *family,
+						 const struct genl_ops *ops,
+						 size_t n_ops)
 {
 	family->module = THIS_MODULE;
 	family->ops = ops;
@@ -161,6 +162,9 @@ static inline int genl_register_family_with_ops(struct genl_family *family,
 	return __genl_register_family(family);
 }
 
+#define genl_register_family_with_ops(family, ops)	\
+	_genl_register_family_with_ops((family), (ops), ARRAY_SIZE(ops))
+
 int genl_unregister_family(struct genl_family *family);
 int genl_register_mc_group(struct genl_family *family,
 			   struct genl_multicast_group *grp);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 76595cd9d211..13d2f7cd65db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -703,8 +703,7 @@ static int __init taskstats_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&family, taskstats_ops,
-					   ARRAY_SIZE(taskstats_ops));
+	rc = genl_register_family_with_ops(&family, taskstats_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index f9fe2f22d20b..0efc5028ba9d 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -365,8 +365,7 @@ static int __init init_net_drop_monitor(void)
 	}
 
 	rc = genl_register_family_with_ops(&net_drop_monitor_family,
-					   dropmon_ops,
-					   ARRAY_SIZE(dropmon_ops));
+					   dropmon_ops);
 	if (rc) {
 		pr_err("Could not create drop monitor netlink family\n");
 		return rc;
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 3b9205d2afc4..f182260be76d 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -414,8 +414,7 @@ int __init hsr_netlink_init(void)
 	if (rc)
 		goto fail_rtnl_link_register;
 
-	rc = genl_register_family_with_ops(&hsr_genl_family, hsr_ops,
-					   ARRAY_SIZE(hsr_ops));
+	rc = genl_register_family_with_ops(&hsr_genl_family, hsr_ops);
 	if (rc)
 		goto fail_genl_register_family;
 
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 3ffcdbb56aab..1a81709a4717 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -129,8 +129,7 @@ int __init ieee802154_nl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&nl802154_family, ieee8021154_ops,
-					   ARRAY_SIZE(ieee8021154_ops));
+	rc = genl_register_family_with_ops(&nl802154_family, ieee8021154_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 8c121b523eee..06493736fbc8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1082,8 +1082,7 @@ void __init tcp_metrics_init(void)
 	if (ret < 0)
 		goto cleanup;
 	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
-					    tcp_metrics_nl_ops,
-					    ARRAY_SIZE(tcp_metrics_nl_ops));
+					    tcp_metrics_nl_ops);
 	if (ret < 0)
 		goto cleanup_subsys;
 	return;
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index bf5d7d476dae..a37b81fe0479 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -149,8 +149,7 @@ static const struct genl_ops irda_nl_ops[] = {
 
 int irda_nl_register(void)
 {
-	return genl_register_family_with_ops(&irda_nl_family,
-		irda_nl_ops, ARRAY_SIZE(irda_nl_ops));
+	return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
 }
 
 void irda_nl_unregister(void)
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 57db66e24f1f..4cfd722e9153 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -887,13 +887,8 @@ EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
 
 static int l2tp_nl_init(void)
 {
-	int err;
-
 	pr_info("L2TP netlink interface\n");
-	err = genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops,
-					    ARRAY_SIZE(l2tp_nl_ops));
-
-	return err;
+	return genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops);
 }
 
 static void l2tp_nl_cleanup(void)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fc8a04ed8854..393498704691 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3666,7 +3666,7 @@ static const struct genl_ops ip_vs_genl_ops[] __read_mostly = {
 static int __init ip_vs_genl_register(void)
 {
 	return genl_register_family_with_ops(&ip_vs_genl_family,
-		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+					     ip_vs_genl_ops);
 }
 
 static void ip_vs_genl_unregister(void)
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 706691739b99..69345cebe3a3 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -783,5 +783,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
 int __init netlbl_cipsov4_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
-		netlbl_cipsov4_ops, ARRAY_SIZE(netlbl_cipsov4_ops));
+					     netlbl_cipsov4_ops);
 }
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 7de6f660b80a..8ef83ee97c6a 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -779,5 +779,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
 int __init netlbl_mgmt_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
-		netlbl_mgmt_genl_ops, ARRAY_SIZE(netlbl_mgmt_genl_ops));
+					     netlbl_mgmt_genl_ops);
 }
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 76ee9252daa7..43817d73ccf9 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1397,7 +1397,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
 int __init netlbl_unlabel_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
-		netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops));
+					     netlbl_unlabel_genl_ops);
 }
 
 /*
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f54215d7b8f3..c68ce73619b5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -906,11 +906,13 @@ static int genl_ctrl_event(int event, void *data)
 	return 0;
 }
 
-static struct genl_ops genl_ctrl_ops = {
-	.cmd		= CTRL_CMD_GETFAMILY,
-	.doit		= ctrl_getfamily,
-	.dumpit		= ctrl_dumpfamily,
-	.policy		= ctrl_policy,
+static struct genl_ops genl_ctrl_ops[] = {
+	{
+		.cmd		= CTRL_CMD_GETFAMILY,
+		.doit		= ctrl_getfamily,
+		.dumpit		= ctrl_dumpfamily,
+		.policy		= ctrl_policy,
+	},
 };
 
 static struct genl_multicast_group notify_grp = {
@@ -954,7 +956,7 @@ static int __init genl_init(void)
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
 		INIT_LIST_HEAD(&family_ht[i]);
 
-	err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1);
+	err = genl_register_family_with_ops(&genl_ctrl, genl_ctrl_ops);
 	if (err < 0)
 		goto problem;
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f5585611c098..fe6760d328a0 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1536,8 +1536,7 @@ int __init nfc_genl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops,
-					   ARRAY_SIZE(nfc_genl_ops));
+	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 91e1c927a465..8ec8b73033e0 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1817,8 +1817,9 @@ static int dp_register_genl(void)
 	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
 		const struct genl_family_and_ops *f = &dp_genl_families[i];
 
-		err = genl_register_family_with_ops(f->family, f->ops,
-						    f->n_ops);
+		f->family->ops = f->ops;
+		f->family->n_ops = f->n_ops;
+		err = genl_register_family(f->family);
 		if (err)
 			goto error;
 		n_registered++;
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 8bcd4985d0fb..9f72a6376362 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -76,9 +76,11 @@ static struct genl_family tipc_genl_family = {
 	.maxattr	= 0,
 };
 
-static struct genl_ops tipc_genl_ops = {
-	.cmd		= TIPC_GENL_CMD,
-	.doit		= handle_cmd,
+static struct genl_ops tipc_genl_ops[] = {
+	{
+		.cmd		= TIPC_GENL_CMD,
+		.doit		= handle_cmd,
+	},
 };
 
 static int tipc_genl_family_registered;
@@ -87,8 +89,7 @@ int tipc_netlink_start(void)
 {
 	int res;
 
-	res = genl_register_family_with_ops(&tipc_genl_family,
-		&tipc_genl_ops, 1);
+	res = genl_register_family_with_ops(&tipc_genl_family, tipc_genl_ops);
 	if (res) {
 		pr_err("Failed to register netlink interface\n");
 		return res;
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 47170c9495f1..6328afe90319 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -597,8 +597,8 @@ int __init wimax_subsys_init(void)
 
 	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
 		 "WiMAX");
-	result = genl_register_family_with_ops(&wimax_gnl_family, wimax_gnl_ops,
-					       ARRAY_SIZE(wimax_gnl_ops));
+	result = genl_register_family_with_ops(&wimax_gnl_family,
+					       wimax_gnl_ops);
 	if (unlikely(result < 0)) {
 		printk(KERN_ERR "cannot register generic netlink family: %d\n",
 		       result);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 58c43c8e149f..1b6c5dd4dccf 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11329,8 +11329,7 @@ int nl80211_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops(&nl80211_fam,
-		nl80211_ops, ARRAY_SIZE(nl80211_ops));
+	err = genl_register_family_with_ops(&nl80211_fam, nl80211_ops);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From dcdfdf56b4a6c9437fc37dbc9cee94a788f9b0c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 19 Nov 2013 19:12:34 -0800
Subject: ipv4: fix race in concurrent ip_route_input_slow()

CPUs can ask for local route via ip_route_input_noref() concurrently.
if nh_rth_input is not cached yet, CPUs will proceed to allocate
equivalent DSTs on 'lo' and then will try to cache them in nh_rth_input
via rt_cache_route()
Most of the time they succeed, but on occasion the following two lines:
	orig = *p;
	prev = cmpxchg(p, orig, rt);
in rt_cache_route() do race and one of the cpus fails to complete cmpxchg.
But ip_route_input_slow() doesn't check the return code of rt_cache_route(),
so dst is leaking. dst_destroy() is never called and 'lo' device
refcnt doesn't go to zero, which can be seen in the logs as:
	unregister_netdevice: waiting for lo to become free. Usage count = 1
Adding mdelay() between above two lines makes it easily reproducible.
Fix it similar to nh_pcpu_rth_output case.

Fixes: d2d68ba9fe8b ("ipv4: Cache input routes in fib_info nexthops.")
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f428935c50db..f8da28278014 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1776,8 +1776,12 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	if (do_cache)
-		rt_cache_route(&FIB_RES_NH(res), rth);
+	if (do_cache) {
+		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+			rth->dst.flags |= DST_NOCACHE;
+			rt_add_uncached_list(rth);
+		}
+	}
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;
-- 
cgit v1.2.3


From cc5c00bbb44c5d68b883aa5cb9d01514a2525d94 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 22 Nov 2013 10:31:29 +0800
Subject: gro: Only verify TCP checksums for candidates

In some cases we may receive IP packets that are longer than
their stated lengths.  Such packets are never merged in GRO.
However, we may end up computing their checksums incorrectly
and end up allowing packets with a bogus checksum enter our
stack with the checksum status set as verified.

Since such packets are rare and not performance-critical, this
patch simply skips the checksum verification for them.

Reported-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>

Thanks,
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_offload.c   | 5 +++++
 net/ipv6/tcpv6_offload.c | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a2b68a108eae..55aeec930140 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -276,6 +276,10 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
 	__wsum wsum;
 	__sum16 sum;
 
+	/* Don't bother verifying checksum if we're going to flush anyway. */
+	if (NAPI_GRO_CB(skb)->flush)
+		goto skip_csum;
+
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
@@ -301,6 +305,7 @@ flush:
 		break;
 	}
 
+skip_csum:
 	return tcp_gro_receive(head, skb);
 }
 
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index c1097c798900..71923d14127a 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -39,6 +39,10 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
 	__wsum wsum;
 	__sum16 sum;
 
+	/* Don't bother verifying checksum if we're going to flush anyway. */
+	if (NAPI_GRO_CB(skb)->flush)
+		goto skip_csum;
+
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
 		if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
@@ -65,6 +69,7 @@ flush:
 		break;
 	}
 
+skip_csum:
 	return tcp_gro_receive(head, skb);
 }
 
-- 
cgit v1.2.3


From b8ee93ba80b5a0b6c3c06b65c34dd1276f16c047 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 22 Nov 2013 10:32:11 +0800
Subject: gro: Clean up tcpX_gro_receive checksum verification

This patch simplifies the checksum verification in tcpX_gro_receive
by reusing the CHECKSUM_COMPLETE code for CHECKSUM_NONE.  All it
does for CHECKSUM_NONE is compute the partial checksum and then
treat it as if it came from the hardware (CHECKSUM_COMPLETE).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_offload.c   | 26 ++++++++++----------------
 net/ipv6/tcpv6_offload.c | 27 ++++++++++-----------------
 2 files changed, 20 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55aeec930140..05606353c7e7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -274,35 +274,29 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
 {
 	const struct iphdr *iph = skb_gro_network_header(skb);
 	__wsum wsum;
-	__sum16 sum;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (NAPI_GRO_CB(skb)->flush)
 		goto skip_csum;
 
+	wsum = skb->csum;
+
 	switch (skb->ip_summed) {
+	case CHECKSUM_NONE:
+		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
+				    0);
+
+		/* fall through */
+
 	case CHECKSUM_COMPLETE:
 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
-				  skb->csum)) {
+				  wsum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			break;
 		}
-flush:
+
 		NAPI_GRO_CB(skb)->flush = 1;
 		return NULL;
-
-	case CHECKSUM_NONE:
-		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
-					  skb_gro_len(skb), IPPROTO_TCP, 0);
-		sum = csum_fold(skb_checksum(skb,
-					     skb_gro_offset(skb),
-					     skb_gro_len(skb),
-					     wsum));
-		if (sum)
-			goto flush;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		break;
 	}
 
 skip_csum:
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 71923d14127a..6d18157dc32c 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -37,36 +37,29 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
 {
 	const struct ipv6hdr *iph = skb_gro_network_header(skb);
 	__wsum wsum;
-	__sum16 sum;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (NAPI_GRO_CB(skb)->flush)
 		goto skip_csum;
 
+	wsum = skb->csum;
+
 	switch (skb->ip_summed) {
+	case CHECKSUM_NONE:
+		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
+				    wsum);
+
+		/* fall through */
+
 	case CHECKSUM_COMPLETE:
 		if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
-				  skb->csum)) {
+				  wsum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			break;
 		}
-flush:
+
 		NAPI_GRO_CB(skb)->flush = 1;
 		return NULL;
-
-	case CHECKSUM_NONE:
-		wsum = ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
-						    skb_gro_len(skb),
-						    IPPROTO_TCP, 0));
-		sum = csum_fold(skb_checksum(skb,
-					     skb_gro_offset(skb),
-					     skb_gro_len(skb),
-					     wsum));
-		if (sum)
-			goto flush;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		break;
 	}
 
 skip_csum:
-- 
cgit v1.2.3


From fb10f802b0fb76079612cb78505cbc9ad81e683b Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Fri, 22 Nov 2013 14:48:40 +0800
Subject: tcp_memcg: remove useless var old_lim

nobody needs it. remove.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_memcontrol.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 03e9154f7e68..269a89ecd2f4 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -60,7 +60,6 @@ EXPORT_SYMBOL(tcp_destroy_cgroup);
 static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 {
 	struct cg_proto *cg_proto;
-	u64 old_lim;
 	int i;
 	int ret;
 
@@ -71,7 +70,6 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 	if (val > RES_COUNTER_MAX)
 		val = RES_COUNTER_MAX;
 
-	old_lim = res_counter_read_u64(&cg_proto->memory_allocated, RES_LIMIT);
 	ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
 	if (ret)
 		return ret;
-- 
cgit v1.2.3


From 85fbaa75037d0b6b786ff18658ddf0b4014ce2a4 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 23 Nov 2013 00:46:12 +0100
Subject: inet: fix addr_len/msg->msg_namelen assignment in recv_error and
 rxpmtu functions

Commit bceaa90240b6019ed73b49965eac7d167610be69 ("inet: prevent leakage
of uninitialized memory to user in recv syscalls") conditionally updated
addr_len if the msg_name is written to. The recv_error and rxpmtu
functions relied on the recvmsg functions to set up addr_len before.

As this does not happen any more we have to pass addr_len to those
functions as well and set it to the size of the corresponding sockaddr
length.

This broke traceroute and such.

Fixes: bceaa90240b6 ("inet: prevent leakage of uninitialized memory to user in recv syscalls")
Reported-by: Brad Spengler <spender@grsecurity.net>
Reported-by: Tom Labanowski
Cc: mpb <mpb.mail@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h       | 2 +-
 include/net/ipv6.h     | 6 ++++--
 include/net/ping.h     | 3 ++-
 net/ipv4/ip_sockglue.c | 3 ++-
 net/ipv4/ping.c        | 5 +++--
 net/ipv4/raw.c         | 2 +-
 net/ipv4/udp.c         | 2 +-
 net/ipv6/datagram.c    | 7 +++++--
 net/ipv6/ping.c        | 3 ++-
 net/ipv6/raw.c         | 4 ++--
 net/ipv6/udp.c         | 4 ++--
 net/l2tp/l2tp_ip6.c    | 2 +-
 12 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 217bc5bfc6c6..5a25f36fe3a7 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -473,7 +473,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 int ip_ra_control(struct sock *sk, unsigned char on,
 		  void (*destructor)(struct sock *));
 
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len);
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
 void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
 		   u32 info, u8 *payload);
 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2a5f668cd683..eb198acaac1d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -776,8 +776,10 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
 
 int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
 
-int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len);
-int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len);
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
+		    int *addr_len);
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
+		     int *addr_len);
 void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
 		     u32 info, u8 *payload);
 void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
diff --git a/include/net/ping.h b/include/net/ping.h
index 3f67704f3747..90f48417b03d 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -31,7 +31,8 @@
 
 /* Compatibility glue so we can support IPv6 when it's compiled as a module */
 struct pingv6_ops {
-	int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len);
+	int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len,
+			       int *addr_len);
 	int (*ip6_datagram_recv_ctl)(struct sock *sk, struct msghdr *msg,
 				     struct sk_buff *skb);
 	int (*icmpv6_err_convert)(u8 type, u8 code, int *err);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 3f858266fa7e..ddf32a6bc415 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -386,7 +386,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
 /*
  *	Handle MSG_ERRQUEUE
  */
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 {
 	struct sock_exterr_skb *serr;
 	struct sk_buff *skb, *skb2;
@@ -423,6 +423,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 						   serr->addr_offset);
 		sin->sin_port = serr->port;
 		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
 	}
 
 	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 876c6ca2d8f9..840cf1b9e6ee 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -841,10 +841,11 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	if (flags & MSG_ERRQUEUE) {
 		if (family == AF_INET) {
-			return ip_recv_error(sk, msg, len);
+			return ip_recv_error(sk, msg, len, addr_len);
 #if IS_ENABLED(CONFIG_IPV6)
 		} else if (family == AF_INET6) {
-			return pingv6_ops.ipv6_recv_error(sk, msg, len);
+			return pingv6_ops.ipv6_recv_error(sk, msg, len,
+							  addr_len);
 #endif
 		}
 	}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5cb8ddb505ee..23c3e5b5bb53 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -697,7 +697,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		goto out;
 
 	if (flags & MSG_ERRQUEUE) {
-		err = ip_recv_error(sk, msg, len);
+		err = ip_recv_error(sk, msg, len, addr_len);
 		goto out;
 	}
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5944d7d668dd..44dfaa09b584 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1236,7 +1236,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	bool slow;
 
 	if (flags & MSG_ERRQUEUE)
-		return ip_recv_error(sk, msg, len);
+		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a454b0ff57c7..d690204a0275 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -318,7 +318,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
 /*
  *	Handle MSG_ERRQUEUE
  */
-int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sock_exterr_skb *serr;
@@ -369,6 +369,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
 					       &sin->sin6_addr);
 			sin->sin6_scope_id = 0;
 		}
+		*addr_len = sizeof(*sin);
 	}
 
 	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
@@ -423,7 +424,8 @@ EXPORT_SYMBOL_GPL(ipv6_recv_error);
 /*
  *	Handle IPV6_RECVPATHMTU
  */
-int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len)
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
+		     int *addr_len)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff *skb;
@@ -457,6 +459,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len)
 		sin->sin6_port = 0;
 		sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id;
 		sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr;
+		*addr_len = sizeof(*sin);
 	}
 
 	put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 8815e31a87fe..a83243c3d656 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -57,7 +57,8 @@ static struct inet_protosw pingv6_protosw = {
 
 
 /* Compatibility glue so we can support IPv6 when it's compiled as a module */
-static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
+static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
+				 int *addr_len)
 {
 	return -EAFNOSUPPORT;
 }
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e24ff1df0401..7fb4e14c467f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -466,10 +466,10 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		return -EOPNOTSUPP;
 
 	if (flags & MSG_ERRQUEUE)
-		return ipv6_recv_error(sk, msg, len);
+		return ipv6_recv_error(sk, msg, len, addr_len);
 
 	if (np->rxpmtu && np->rxopt.bits.rxpmtu)
-		return ipv6_recv_rxpmtu(sk, msg, len);
+		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 81eb8cf8389b..bcd5699313c3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -393,10 +393,10 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	bool slow;
 
 	if (flags & MSG_ERRQUEUE)
-		return ipv6_recv_error(sk, msg, len);
+		return ipv6_recv_error(sk, msg, len, addr_len);
 
 	if (np->rxpmtu && np->rxopt.bits.rxpmtu)
-		return ipv6_recv_rxpmtu(sk, msg, len);
+		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
 	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index cfd65304be60..d9b437e55007 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -665,7 +665,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		*addr_len = sizeof(*lsa);
 
 	if (flags & MSG_ERRQUEUE)
-		return ipv6_recv_error(sk, msg, len);
+		return ipv6_recv_error(sk, msg, len, addr_len);
 
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
-- 
cgit v1.2.3


From 39af0c409e8c06b51caf7dc43e49ef96ae790a55 Mon Sep 17 00:00:00 2001
From: Baker Zhang <Baker.kernel@gmail.com>
Date: Tue, 26 Nov 2013 09:29:15 +0800
Subject: net: remove outdated comment for ipv4 and ipv6 protocol handler

since f9242b6b28d61295f2bf7e8adfb1060b382e5381
inet: Sanitize inet{,6} protocol demux.

there are not pretended hash tables for ipv4 or
ipv6 protocol handler.

Signed-off-by: Baker Zhang <Baker.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/protocol.c | 8 --------
 net/ipv6/protocol.c | 4 ----
 2 files changed, 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index ce848461acbb..46d6a1c923a8 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -31,10 +31,6 @@
 const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
 
-/*
- *	Add a protocol handler to the hash tables
- */
-
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
 	if (!prot->netns_ok) {
@@ -55,10 +51,6 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
 }
 EXPORT_SYMBOL(inet_add_offload);
 
-/*
- *	Remove a protocol from the hash tables.
- */
-
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
 	int ret;
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 22d1bd4670da..e048cf1bb6a2 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -36,10 +36,6 @@ int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol
 }
 EXPORT_SYMBOL(inet6_add_protocol);
 
-/*
- *	Remove a protocol from the hash tables.
- */
-
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
 	int ret;
-- 
cgit v1.2.3


From d3f7d56a7a4671d395e8af87071068a195257bf6 Mon Sep 17 00:00:00 2001
From: Shawn Landden <shawn@churchofgit.com>
Date: Sun, 24 Nov 2013 22:36:28 -0800
Subject: net: update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST

Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once)
added an internal flag MSG_SENDPAGE_NOTLAST, similar to
MSG_MORE.

algif_hash, algif_skcipher, and udp used MSG_MORE from tcp_sendpages()
and need to see the new flag as identical to MSG_MORE.

This fixes sendfile() on AF_ALG.

v3: also fix udp

Cc: Tom Herbert <therbert@google.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: <stable@vger.kernel.org> # 3.4.x + 3.2.x
Reported-and-tested-by: Shawn Landden <shawnlandden@gmail.com>
Original-patch: Richard Weinberger <richard@nod.at>
Signed-off-by: Shawn Landden <shawn@churchofgit.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/algif_hash.c     | 3 +++
 crypto/algif_skcipher.c | 3 +++
 net/ipv4/udp.c          | 3 +++
 3 files changed, 9 insertions(+)

(limited to 'net/ipv4')

diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index ef5356cd280a..850246206b12 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page,
 	struct hash_ctx *ctx = ask->private;
 	int err;
 
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
 	lock_sock(sk);
 	sg_init_table(ctx->sgl.sg, 1);
 	sg_set_page(ctx->sgl.sg, page, size, offset);
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 6a6dfc062d2a..a19c027b29bd 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page,
 	struct skcipher_sg_list *sgl;
 	int err = -EINVAL;
 
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
 	lock_sock(sk);
 	if (!ctx->more && ctx->used)
 		goto unlock;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 44dfaa09b584..bf2b85b25491 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1098,6 +1098,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	struct udp_sock *up = udp_sk(sk);
 	int ret;
 
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
 	if (!up->pending) {
 		struct msghdr msg = {	.msg_flags = flags|MSG_MORE };
 
-- 
cgit v1.2.3


From f1d8cba61c3c4b1eb88e507249c4cb8d635d9a76 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 28 Nov 2013 09:51:22 -0800
Subject: inet: fix possible seqlock deadlocks

In commit c9e9042994d3 ("ipv4: fix possible seqlock deadlock") I left
another places where IP_INC_STATS_BH() were improperly used.

udp_sendmsg(), ping_v4_sendmsg() and tcp_v4_connect() are called from
process context, not from softirq context.

This was detected by lockdep seqlock support.

Reported-by: jongman heo <jongman.heo@samsung.com>
Fixes: 584bdf8cbdf6 ("[IPV4]: Fix "ipOutNoRoutes" counter error for TCP and UDP")
Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c     | 2 +-
 net/ipv4/tcp_ipv4.c | 2 +-
 net/ipv4/udp.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 840cf1b9e6ee..242e7f4ed6f4 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -772,7 +772,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		err = PTR_ERR(rt);
 		rt = NULL;
 		if (err == -ENETUNREACH)
-			IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 		goto out;
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 59a6f8b90cd9..067213924751 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -177,7 +177,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
-			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 		return err;
 	}
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bf2b85b25491..44f6a20fa29d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -999,7 +999,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			err = PTR_ERR(rt);
 			rt = NULL;
 			if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 			goto out;
 		}
 
-- 
cgit v1.2.3


From 7f2cbdc28c034ef2c3be729681f631d5744e3cd5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Dec 2013 20:12:04 -0800
Subject: tcp_memcontrol: Cleanup/fix cg_proto->memory_pressure handling.

kill memcg_tcp_enter_memory_pressure.  The only function of
memcg_tcp_enter_memory_pressure was to reduce deal with the
unnecessary abstraction that was tcp_memcontrol.  Now that struct
tcp_memcontrol is gone remove this unnecessary function, the
unnecessary function pointer, and modify sk_enter_memory_pressure to
set this field directly, just as sk_leave_memory_pressure cleas this
field directly.

This fixes a small bug I intruduced when killing struct tcp_memcontrol
that caused memcg_tcp_enter_memory_pressure to never be called and
thus failed to ever set cg_proto->memory_pressure.

Remove the cg_proto enter_memory_pressure function as it now serves
no useful purpose.

Don't test cg_proto->memory_presser in sk_leave_memory_pressure before
clearing it.  The test was originally there to ensure that the pointer
was non-NULL.  Now that cg_proto is not a pointer the pointer does not
matter.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h        | 6 ++----
 net/ipv4/tcp_memcontrol.c | 7 -------
 2 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index e3a18ff0c38b..2ef3c3eca47a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1035,7 +1035,6 @@ enum cg_proto_flags {
 };
 
 struct cg_proto {
-	void			(*enter_memory_pressure)(struct sock *sk);
 	struct res_counter	memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
 	int			memory_pressure;
@@ -1155,8 +1154,7 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
 		struct proto *prot = sk->sk_prot;
 
 		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			if (cg_proto->memory_pressure)
-				cg_proto->memory_pressure = 0;
+			cg_proto->memory_pressure = 0;
 	}
 
 }
@@ -1171,7 +1169,7 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 		struct proto *prot = sk->sk_prot;
 
 		for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-			cg_proto->enter_memory_pressure(sk);
+			cg_proto->memory_pressure = 1;
 	}
 
 	sk->sk_prot->enter_memory_pressure(sk);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 269a89ecd2f4..f7e522c558ba 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,13 +6,6 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
-static void memcg_tcp_enter_memory_pressure(struct sock *sk)
-{
-	if (sk->sk_cgrp->memory_pressure)
-		sk->sk_cgrp->memory_pressure = 1;
-}
-EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
-
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
-- 
cgit v1.2.3


From 673498b8ed4c4d4b7221c5309d891c5eac2b7528 Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Tue, 10 Dec 2013 23:21:25 +0100
Subject: inet: fix NULL pointer Oops in fib(6)_rule_suppress

This changes ensures that the routing entry investigated by the suppress
function actually does point to a device struct before following that pointer,
fixing a possible kernel oops situation when verifying the interface group
associated with a routing table entry.

According to Daniel Golle, this Oops can be triggered by a user process trying
to establish an outgoing IPv6 connection while having no real IPv6 connectivity
set up (only autoassigned link-local addresses).

Fixes: 6ef94cfafba15 ("fib_rules: add route suppression based on ifgroup")

Reported-by: Daniel Golle <daniel.golle@gmail.com>
Tested-by: Daniel Golle <daniel.golle@gmail.com>
Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_rules.c  | 5 ++++-
 net/ipv6/fib6_rules.c | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 523be38e37de..f2e15738534d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -104,7 +104,10 @@ errout:
 static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct fib_result *result = (struct fib_result *) arg->result;
-	struct net_device *dev = result->fi->fib_dev;
+	struct net_device *dev = NULL;
+
+	if (result->fi)
+		dev = result->fi->fib_dev;
 
 	/* do not accept result if the route does
 	 * not meet the required prefix length
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index e27591635f92..3fd0a578329e 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -122,7 +122,11 @@ out:
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
-	struct net_device *dev = rt->rt6i_idev->dev;
+	struct net_device *dev = NULL;
+
+	if (rt->rt6i_idev)
+		dev = rt->rt6i_idev->dev;
+
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-- 
cgit v1.2.3


From 8afdd99a1315e759de04ad6e2344f0c5f17ecb1b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 10 Dec 2013 18:07:23 -0800
Subject: udp: ipv4: fix an use after free in __udp4_lib_rcv()

Dave Jones reported a use after free in UDP stack :

[ 5059.434216] =========================
[ 5059.434314] [ BUG: held lock freed! ]
[ 5059.434420] 3.13.0-rc3+ #9 Not tainted
[ 5059.434520] -------------------------
[ 5059.434620] named/863 is freeing memory ffff88005e960000-ffff88005e96061f, with a lock still held there!
[ 5059.434815]  (slock-AF_INET){+.-...}, at: [<ffffffff8149bd21>] udp_queue_rcv_skb+0xd1/0x4b0
[ 5059.435012] 3 locks held by named/863:
[ 5059.435086]  #0:  (rcu_read_lock){.+.+..}, at: [<ffffffff8143054d>] __netif_receive_skb_core+0x11d/0x940
[ 5059.435295]  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff81467a5e>] ip_local_deliver_finish+0x3e/0x410
[ 5059.435500]  #2:  (slock-AF_INET){+.-...}, at: [<ffffffff8149bd21>] udp_queue_rcv_skb+0xd1/0x4b0
[ 5059.435734]
stack backtrace:
[ 5059.435858] CPU: 0 PID: 863 Comm: named Not tainted 3.13.0-rc3+ #9 [loadavg: 0.21 0.06 0.06 1/115 1365]
[ 5059.436052] Hardware name:                  /D510MO, BIOS MOPNV10J.86A.0175.2010.0308.0620 03/08/2010
[ 5059.436223]  0000000000000002 ffff88007e203ad8 ffffffff8153a372 ffff8800677130e0
[ 5059.436390]  ffff88007e203b10 ffffffff8108cafa ffff88005e960000 ffff88007b00cfc0
[ 5059.436554]  ffffea00017a5800 ffffffff8141c490 0000000000000246 ffff88007e203b48
[ 5059.436718] Call Trace:
[ 5059.436769]  <IRQ>  [<ffffffff8153a372>] dump_stack+0x4d/0x66
[ 5059.436904]  [<ffffffff8108cafa>] debug_check_no_locks_freed+0x15a/0x160
[ 5059.437037]  [<ffffffff8141c490>] ? __sk_free+0x110/0x230
[ 5059.437147]  [<ffffffff8112da2a>] kmem_cache_free+0x6a/0x150
[ 5059.437260]  [<ffffffff8141c490>] __sk_free+0x110/0x230
[ 5059.437364]  [<ffffffff8141c5c9>] sk_free+0x19/0x20
[ 5059.437463]  [<ffffffff8141cb25>] sock_edemux+0x25/0x40
[ 5059.437567]  [<ffffffff8141c181>] sock_queue_rcv_skb+0x81/0x280
[ 5059.437685]  [<ffffffff8149bd21>] ? udp_queue_rcv_skb+0xd1/0x4b0
[ 5059.437805]  [<ffffffff81499c82>] __udp_queue_rcv_skb+0x42/0x240
[ 5059.437925]  [<ffffffff81541d25>] ? _raw_spin_lock+0x65/0x70
[ 5059.438038]  [<ffffffff8149bebb>] udp_queue_rcv_skb+0x26b/0x4b0
[ 5059.438155]  [<ffffffff8149c712>] __udp4_lib_rcv+0x152/0xb00
[ 5059.438269]  [<ffffffff8149d7f5>] udp_rcv+0x15/0x20
[ 5059.438367]  [<ffffffff81467b2f>] ip_local_deliver_finish+0x10f/0x410
[ 5059.438492]  [<ffffffff81467a5e>] ? ip_local_deliver_finish+0x3e/0x410
[ 5059.438621]  [<ffffffff81468653>] ip_local_deliver+0x43/0x80
[ 5059.438733]  [<ffffffff81467f70>] ip_rcv_finish+0x140/0x5a0
[ 5059.438843]  [<ffffffff81468926>] ip_rcv+0x296/0x3f0
[ 5059.438945]  [<ffffffff81430b72>] __netif_receive_skb_core+0x742/0x940
[ 5059.439074]  [<ffffffff8143054d>] ? __netif_receive_skb_core+0x11d/0x940
[ 5059.442231]  [<ffffffff8108c81d>] ? trace_hardirqs_on+0xd/0x10
[ 5059.442231]  [<ffffffff81430d83>] __netif_receive_skb+0x13/0x60
[ 5059.442231]  [<ffffffff81431c1e>] netif_receive_skb+0x1e/0x1f0
[ 5059.442231]  [<ffffffff814334e0>] napi_gro_receive+0x70/0xa0
[ 5059.442231]  [<ffffffffa01de426>] rtl8169_poll+0x166/0x700 [r8169]
[ 5059.442231]  [<ffffffff81432bc9>] net_rx_action+0x129/0x1e0
[ 5059.442231]  [<ffffffff810478cd>] __do_softirq+0xed/0x240
[ 5059.442231]  [<ffffffff81047e25>] irq_exit+0x125/0x140
[ 5059.442231]  [<ffffffff81004241>] do_IRQ+0x51/0xc0
[ 5059.442231]  [<ffffffff81542bef>] common_interrupt+0x6f/0x6f

We need to keep a reference on the socket, by using skb_steal_sock()
at the right place.

Note that another patch is needed to fix a race in
udp_sk_rx_dst_set(), as we hold no lock protecting the dst.

Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux")
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 44f6a20fa29d..2e2aecbe22c4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -560,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 						 __be16 sport, __be16 dport,
 						 struct udp_table *udptable)
 {
-	struct sock *sk;
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
-		return sk;
-	else
-		return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
-					 iph->daddr, dport, inet_iif(skb),
-					 udptable);
+	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+				 iph->daddr, dport, inet_iif(skb),
+				 udptable);
 }
 
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
@@ -1739,15 +1735,15 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (udp4_csum_init(skb, uh, proto))
 		goto csum_error;
 
-	if (skb->sk) {
+	sk = skb_steal_sock(skb);
+	if (sk) {
 		int ret;
-		sk = skb->sk;
 
 		if (unlikely(sk->sk_rx_dst == NULL))
 			udp_sk_rx_dst_set(sk, skb);
 
 		ret = udp_queue_rcv_skb(sk, skb);
-
+		sock_put(sk);
 		/* a return value > 0 means to resubmit the input, but
 		 * it wants the return to be -protocol, or 0
 		 */
-- 
cgit v1.2.3


From 610438b74496b2986a9025f8e23c134cb638e338 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Dec 2013 08:10:05 -0800
Subject: udp: ipv4: fix potential use after free in udp_v4_early_demux()

pskb_may_pull() can reallocate skb->head, we need to move the
initialization of iph and uh pointers after its call.

Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2e2aecbe22c4..16d246a51a02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1909,17 +1909,20 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 
 void udp_v4_early_demux(struct sk_buff *skb)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct udphdr *uh = udp_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct udphdr *uh;
 	struct sock *sk;
 	struct dst_entry *dst;
-	struct net *net = dev_net(skb->dev);
 	int dif = skb->dev->ifindex;
 
 	/* validate the packet */
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
 		return;
 
+	iph = ip_hdr(skb);
+	uh = udp_hdr(skb);
+
 	if (skb->pkt_type == PACKET_BROADCAST ||
 	    skb->pkt_type == PACKET_MULTICAST)
 		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
-- 
cgit v1.2.3


From 975022310233fb0f0193873d79a7b8438070fa82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Dec 2013 14:46:51 -0800
Subject: udp: ipv4: must add synchronization in udp_sk_rx_dst_set()

Unlike TCP, UDP input path does not hold the socket lock.

Before messing with sk->sk_rx_dst, we must use a spinlock, otherwise
multiple cpus could leak a refcount.

This patch also takes care of renewing a stale dst entry.
(When the sk->sk_rx_dst would not be used by IP early demux)

Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 16d246a51a02..62c19fdd102d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1599,12 +1599,21 @@ static void flush_stack(struct sock **stack, unsigned int count,
 		kfree_skb(skb1);
 }
 
-static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use sk_dst_lock to guard against concurrent changes.
+ */
+static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
 {
-	struct dst_entry *dst = skb_dst(skb);
+	struct dst_entry *old;
 
-	dst_hold(dst);
-	sk->sk_rx_dst = dst;
+	spin_lock(&sk->sk_dst_lock);
+	old = sk->sk_rx_dst;
+	if (likely(old != dst)) {
+		dst_hold(dst);
+		sk->sk_rx_dst = dst;
+		dst_release(old);
+	}
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 /*
@@ -1737,10 +1746,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 
 	sk = skb_steal_sock(skb);
 	if (sk) {
+		struct dst_entry *dst = skb_dst(skb);
 		int ret;
 
-		if (unlikely(sk->sk_rx_dst == NULL))
-			udp_sk_rx_dst_set(sk, skb);
+		if (unlikely(sk->sk_rx_dst != dst))
+			udp_sk_rx_dst_set(sk, dst);
 
 		ret = udp_queue_rcv_skb(sk, skb);
 		sock_put(sk);
-- 
cgit v1.2.3