From 510c321b557121861601f9d259aadd65aa274f35 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Feb 2018 19:06:02 +0800
Subject: xfrm: reuse uncached_list to track xdsts

In early time, when freeing a xdst, it would be inserted into
dst_garbage.list first. Then if it's refcnt was still held
somewhere, later it would be put into dst_busy_list in
dst_gc_task().

When one dev was being unregistered, the dev of these dsts in
dst_busy_list would be set with loopback_dev and put this dev.
So that this dev's removal wouldn't get blocked, and avoid the
kmsg warning:

  kernel:unregister_netdevice: waiting for veth0 to become \
  free. Usage count = 2

However after Commit 52df157f17e5 ("xfrm: take refcnt of dst
when creating struct xfrm_dst bundle"), the xdst will not be
freed with dst gc, and this warning happens.

To fix it, we need to find these xdsts that are still held by
others when removing the dev, and free xdst's dev and set it
with loopback_dev.

But unfortunately after flow_cache for xfrm was deleted, no
list tracks them anymore. So we need to save these xdsts
somewhere to release the xdst's dev later.

To make this easier, this patch is to reuse uncached_list to
track xdsts, so that the dev refcnt can be released in the
event NETDEV_UNREGISTER process of fib_netdev_notifier.

Thanks to Florian, we could move forward this fix quickly.

Fixes: 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle")
Reported-by: Jianlin Shi <jishi@redhat.com>
Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Tested-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/ip6_route.h | 3 +++
 include/net/route.h     | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 27d23a65f3cd..ac0866bb9e93 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -179,6 +179,9 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
 void rt6_multipath_rebalance(struct rt6_info *rt);
 
+void rt6_uncached_list_add(struct rt6_info *rt);
+void rt6_uncached_list_del(struct rt6_info *rt);
+
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
 	const struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/route.h b/include/net/route.h
index 1eb9ce470e25..40b870d58f38 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -227,6 +227,9 @@ struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
 void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
 
+void rt_add_uncached_list(struct rtable *rt);
+void rt_del_uncached_list(struct rtable *rt);
+
 static inline void ip_rt_put(struct rtable *rt)
 {
 	/* dst_release() accepts a NULL parameter.
-- 
cgit v1.2.3


From d02f51cbcf12b09ab945873e35046045875eed9a Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Sat, 3 Mar 2018 03:03:46 +0100
Subject: bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp
 skbs

SCTP GSO skbs have a gso_size of GSO_BY_FRAGS, so any sort of
unconditionally mangling of that will result in nonsense value
and would corrupt the skb later on.

Therefore, i) add two helpers skb_increase_gso_size() and
skb_decrease_gso_size() that would throw a one time warning and
bail out for such skbs and ii) refuse and return early with an
error in those BPF helpers that are affected. We do need to bail
out as early as possible from there before any changes on the
skb have been performed.

Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Co-authored-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/networking/segmentation-offloads.txt | 11 +++-
 include/linux/skbuff.h                             | 22 ++++++++
 net/core/filter.c                                  | 60 +++++++++++++++-------
 3 files changed, 73 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt
index d47480b61ac6..23a8dd91a9ec 100644
--- a/Documentation/networking/segmentation-offloads.txt
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -153,8 +153,15 @@ To signal this, gso_size is set to the special value GSO_BY_FRAGS.
 
 Therefore, any code in the core networking stack must be aware of the
 possibility that gso_size will be GSO_BY_FRAGS and handle that case
-appropriately. (For size checks, the skb_gso_validate_*_len family of
-helpers do this automatically.)
+appropriately.
+
+There are a couple of helpers to make this easier:
+
+ - For size checks, the skb_gso_validate_*_len family of helpers correctly
+   considers GSO_BY_FRAGS.
+
+ - For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size
+   will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs.
 
 This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits
 set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c1e66bdcf583..8c67c33f40c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4038,6 +4038,12 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
 }
 
+/* Note: Should be called only if skb_is_gso(skb) is true */
+static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
+}
+
 static inline void skb_gso_reset(struct sk_buff *skb)
 {
 	skb_shinfo(skb)->gso_size = 0;
@@ -4045,6 +4051,22 @@ static inline void skb_gso_reset(struct sk_buff *skb)
 	skb_shinfo(skb)->gso_type = 0;
 }
 
+static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
+					 u16 increment)
+{
+	if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+		return;
+	shinfo->gso_size += increment;
+}
+
+static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
+					 u16 decrement)
+{
+	if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
+		return;
+	shinfo->gso_size -= decrement;
+}
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb);
 
 static inline bool skb_warn_if_lro(const struct sk_buff *skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c121adbdbaa..48aa7c7320db 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2087,6 +2087,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_cow(skb, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2096,19 +2100,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* SKB_GSO_TCPV4 needs to be changed into
 		 * SKB_GSO_TCPV6.
 		 */
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
-			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
-			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
+		if (shinfo->gso_type & SKB_GSO_TCPV4) {
+			shinfo->gso_type &= ~SKB_GSO_TCPV4;
+			shinfo->gso_type |=  SKB_GSO_TCPV6;
 		}
 
 		/* Due to IPv6 header, MSS needs to be downgraded. */
-		skb_shinfo(skb)->gso_size -= len_diff;
+		skb_decrease_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IPV6);
@@ -2123,6 +2129,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_unclone(skb, GFP_ATOMIC);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2132,19 +2142,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* SKB_GSO_TCPV6 needs to be changed into
 		 * SKB_GSO_TCPV4.
 		 */
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
-			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
-			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
+		if (shinfo->gso_type & SKB_GSO_TCPV6) {
+			shinfo->gso_type &= ~SKB_GSO_TCPV6;
+			shinfo->gso_type |=  SKB_GSO_TCPV4;
 		}
 
 		/* Due to IPv4 header, MSS can be upgraded. */
-		skb_shinfo(skb)->gso_size += len_diff;
+		skb_increase_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	skb->protocol = htons(ETH_P_IP);
@@ -2243,6 +2255,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_cow(skb, len_diff);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2252,11 +2268,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* Due to header grow, MSS needs to be downgraded. */
-		skb_shinfo(skb)->gso_size -= len_diff;
+		skb_decrease_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	return 0;
@@ -2267,6 +2285,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
+	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+		return -ENOTSUPP;
+
 	ret = skb_unclone(skb, GFP_ATOMIC);
 	if (unlikely(ret < 0))
 		return ret;
@@ -2276,11 +2298,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 		return ret;
 
 	if (skb_is_gso(skb)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 		/* Due to header shrink, MSS can be upgraded. */
-		skb_shinfo(skb)->gso_size += len_diff;
+		skb_increase_gso_size(shinfo, len_diff);
 		/* Header must be checked, and gso_segs recomputed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
+		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_segs = 0;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 2fe4c22c53fc2e3f35be2cd0033cb3d15ebd41b1 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 27 Feb 2018 08:03:56 -0500
Subject: media: rc: lirc does not use LIRC_CAN_SEND_SCANCODE feature

Since commit 02d742f4b209 ("media: lirc: lirc daemon fails to detect raw
IR device"), the feature LIRC_CAN_SEND_SCANCODE is no longer used as it
tripped up lircd. The ability to send scancodes for IR Tx is implied by
LIRC_CAN_SEND_PULSE (i.e. any device that can send can use IR Tx encoders).

So, remove LIRC_CAN_SEND_SCANCODE since it never used. This fixes:

Documentation/output/lirc.h.rst:6: WARNING: undefined label:
lirc-can-send-scancode (if the link has no caption the label must precede
a section header

As this flag was added for kernel 4.16, let's remove it, while not too
late.

Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/lirc.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h
index 4fe580d36e41..f5bf06ecd87d 100644
--- a/include/uapi/linux/lirc.h
+++ b/include/uapi/linux/lirc.h
@@ -54,7 +54,6 @@
 #define LIRC_CAN_SEND_RAW              LIRC_MODE2SEND(LIRC_MODE_RAW)
 #define LIRC_CAN_SEND_PULSE            LIRC_MODE2SEND(LIRC_MODE_PULSE)
 #define LIRC_CAN_SEND_MODE2            LIRC_MODE2SEND(LIRC_MODE_MODE2)
-#define LIRC_CAN_SEND_SCANCODE         LIRC_MODE2SEND(LIRC_MODE_SCANCODE)
 #define LIRC_CAN_SEND_LIRCCODE         LIRC_MODE2SEND(LIRC_MODE_LIRCCODE)
 
 #define LIRC_CAN_SEND_MASK             0x0000003f
-- 
cgit v1.2.3


From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 4 Mar 2018 17:29:48 +0200
Subject: rhashtable: Fix rhlist duplicates insertion

When inserting duplicate objects (those with the same key),
current rhlist implementation messes up the chain pointers by
updating the bucket pointer instead of prev next pointer to the
newly inserted node. This causes missing elements on removal and
travesal.

Fix that by properly updating pprev pointer to point to
the correct rhash_head next pointer.

Issue: 1241076
Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7
Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface')
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 4 +++-
 lib/rhashtable.c           | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index c9df2527e0cd..668a21f04b09 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -766,8 +766,10 @@ slow_path:
 		if (!key ||
 		    (params.obj_cmpfn ?
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
-		     rhashtable_compare(&arg, rht_obj(ht, head))))
+		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
+			pprev = &head->next;
 			continue;
+		}
 
 		data = rht_obj(ht, head);
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 3825c30aaa36..47de025b6245 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		if (!key ||
 		    (ht->p.obj_cmpfn ?
 		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
-		     rhashtable_compare(&arg, rht_obj(ht, head))))
+		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
+			pprev = &head->next;
 			continue;
+		}
 
 		if (!ht->rhlist)
 			return rht_obj(ht, head);
-- 
cgit v1.2.3


From 35d889d10b649fda66121891ec05eca88150059d Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Mon, 5 Mar 2018 20:52:54 +0300
Subject: sch_netem: fix skb leak in netem_enqueue()

When we exceed current packets limit and we have more than one
segment in the list returned by skb_gso_segment(), netem drops
only the first one, skipping the rest, hence kmemleak reports:

unreferenced object 0xffff880b5d23b600 (size 1024):
  comm "softirq", pid 0, jiffies 4384527763 (age 2770.629s)
  hex dump (first 32 bytes):
    00 80 23 5d 0b 88 ff ff 00 00 00 00 00 00 00 00  ..#]............
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000d8a19b9d>] __alloc_skb+0xc9/0x520
    [<000000001709b32f>] skb_segment+0x8c8/0x3710
    [<00000000c7b9bb88>] tcp_gso_segment+0x331/0x1830
    [<00000000c921cba1>] inet_gso_segment+0x476/0x1370
    [<000000008b762dd4>] skb_mac_gso_segment+0x1f9/0x510
    [<000000002182660a>] __skb_gso_segment+0x1dd/0x620
    [<00000000412651b9>] netem_enqueue+0x1536/0x2590 [sch_netem]
    [<0000000005d3b2a9>] __dev_queue_xmit+0x1167/0x2120
    [<00000000fc5f7327>] ip_finish_output2+0x998/0xf00
    [<00000000d309e9d3>] ip_output+0x1aa/0x2c0
    [<000000007ecbd3a4>] tcp_transmit_skb+0x18db/0x3670
    [<0000000042d2a45f>] tcp_write_xmit+0x4d4/0x58c0
    [<0000000056a44199>] tcp_tasklet_func+0x3d9/0x540
    [<0000000013d06d02>] tasklet_action+0x1ca/0x250
    [<00000000fcde0b8b>] __do_softirq+0x1b4/0x5a3
    [<00000000e7ed027c>] irq_exit+0x1e2/0x210

Fix it by adding the rest of the segments, if any, to skb 'to_free'
list. Add new __qdisc_drop_all() and qdisc_drop_all() functions
because they can be useful in the future if we need to drop segmented
GSO packets in other places.

Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 19 +++++++++++++++++++
 net/sched/sch_netem.c     |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e2ab13687fb9..2092d33194dd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -824,6 +824,16 @@ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
 	*to_free = skb;
 }
 
+static inline void __qdisc_drop_all(struct sk_buff *skb,
+				    struct sk_buff **to_free)
+{
+	if (skb->prev)
+		skb->prev->next = *to_free;
+	else
+		skb->next = *to_free;
+	*to_free = skb;
+}
+
 static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
 						   struct qdisc_skb_head *qh,
 						   struct sk_buff **to_free)
@@ -956,6 +966,15 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_DROP;
 }
 
+static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
+				 struct sk_buff **to_free)
+{
+	__qdisc_drop_all(skb, to_free);
+	qdisc_qstats_drop(sch);
+
+	return NET_XMIT_DROP;
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7c179addebcd..7d6801fc5340 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -509,7 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	if (unlikely(sch->q.qlen >= sch->limit))
-		return qdisc_drop(skb, sch, to_free);
+		return qdisc_drop_all(skb, sch, to_free);
 
 	qdisc_qstats_backlog_inc(sch, skb);
 
-- 
cgit v1.2.3


From 2695578b896aea472b2c0dcbe9d92daa71738484 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 5 Mar 2018 11:41:13 -0800
Subject: net: usbnet: fix potential deadlock on 32bit hosts

Marek reported a LOCKDEP issue occurring on 32bit host,
that we tracked down to the fact that usbnet could either
run from soft or hard irqs.

This patch adds u64_stats_update_begin_irqsave() and
u64_stats_update_end_irqrestore() helpers to solve this case.

[   17.768040] ================================
[   17.772239] WARNING: inconsistent lock state
[   17.776511] 4.16.0-rc3-next-20180227-00007-g876c53a7493c #453 Not tainted
[   17.783329] --------------------------------
[   17.787580] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
[   17.793607] swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[   17.798751]  (&syncp->seq#5){?.-.}, at: [<9b22e5f0>]
asix_rx_fixup_internal+0x188/0x288
[   17.806790] {IN-HARDIRQ-W} state was registered at:
[   17.811677]   tx_complete+0x100/0x208
[   17.815319]   __usb_hcd_giveback_urb+0x60/0xf0
[   17.819770]   xhci_giveback_urb_in_irq+0xa8/0x240
[   17.824469]   xhci_td_cleanup+0xf4/0x16c
[   17.828367]   xhci_irq+0xe74/0x2240
[   17.831827]   usb_hcd_irq+0x24/0x38
[   17.835343]   __handle_irq_event_percpu+0x98/0x510
[   17.840111]   handle_irq_event_percpu+0x1c/0x58
[   17.844623]   handle_irq_event+0x38/0x5c
[   17.848519]   handle_fasteoi_irq+0xa4/0x138
[   17.852681]   generic_handle_irq+0x18/0x28
[   17.856760]   __handle_domain_irq+0x6c/0xe4
[   17.860941]   gic_handle_irq+0x54/0xa0
[   17.864666]   __irq_svc+0x70/0xb0
[   17.867964]   arch_cpu_idle+0x20/0x3c
[   17.871578]   arch_cpu_idle+0x20/0x3c
[   17.875190]   do_idle+0x144/0x218
[   17.878468]   cpu_startup_entry+0x18/0x1c
[   17.882454]   start_kernel+0x394/0x400
[   17.886177] irq event stamp: 161912
[   17.889616] hardirqs last  enabled at (161912): [<7bedfacf>]
__netdev_alloc_skb+0xcc/0x140
[   17.897893] hardirqs last disabled at (161911): [<d58261d0>]
__netdev_alloc_skb+0x94/0x140
[   17.904903] exynos5-hsi2c 12ca0000.i2c: tx timeout
[   17.906116] softirqs last  enabled at (161904): [<387102ff>]
irq_enter+0x78/0x80
[   17.906123] softirqs last disabled at (161905): [<cf4c628e>]
irq_exit+0x134/0x158
[   17.925722].
[   17.925722] other info that might help us debug this:
[   17.933435]  Possible unsafe locking scenario:
[   17.933435].
[   17.940331]        CPU0
[   17.942488]        ----
[   17.944894]   lock(&syncp->seq#5);
[   17.948274]   <Interrupt>
[   17.950847]     lock(&syncp->seq#5);
[   17.954386].
[   17.954386]  *** DEADLOCK ***
[   17.954386].
[   17.962422] no locks held by swapper/0/0.

Fixes: c8b5d129ee29 ("net: usbnet: support 64bit stats")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c       | 10 ++++++----
 include/linux/u64_stats_sync.h | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 8a22ff67b026..d9eea8cfe6cb 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -315,6 +315,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev)
 void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb)
 {
 	struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64);
+	unsigned long flags;
 	int	status;
 
 	if (test_bit(EVENT_RX_PAUSED, &dev->flags)) {
@@ -326,10 +327,10 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb)
 	if (skb->protocol == 0)
 		skb->protocol = eth_type_trans (skb, dev->net);
 
-	u64_stats_update_begin(&stats64->syncp);
+	flags = u64_stats_update_begin_irqsave(&stats64->syncp);
 	stats64->rx_packets++;
 	stats64->rx_bytes += skb->len;
-	u64_stats_update_end(&stats64->syncp);
+	u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 
 	netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n",
 		  skb->len + sizeof (struct ethhdr), skb->protocol);
@@ -1248,11 +1249,12 @@ static void tx_complete (struct urb *urb)
 
 	if (urb->status == 0) {
 		struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64);
+		unsigned long flags;
 
-		u64_stats_update_begin(&stats64->syncp);
+		flags = u64_stats_update_begin_irqsave(&stats64->syncp);
 		stats64->tx_packets += entry->packets;
 		stats64->tx_bytes += entry->length;
-		u64_stats_update_end(&stats64->syncp);
+		u64_stats_update_end_irqrestore(&stats64->syncp, flags);
 	} else {
 		dev->net->stats.tx_errors++;
 
diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 5bdbd9f49395..07ee0f84a46c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -90,6 +90,28 @@ static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 #endif
 }
 
+static inline unsigned long
+u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
+{
+	unsigned long flags = 0;
+
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	local_irq_save(flags);
+	write_seqcount_begin(&syncp->seq);
+#endif
+	return flags;
+}
+
+static inline void
+u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
+				unsigned long flags)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	write_seqcount_end(&syncp->seq);
+	local_irq_restore(flags);
+#endif
+}
+
 static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-- 
cgit v1.2.3


From 3a4030761ea88ff439030ca98e3094b9900e96b7 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Mar 2018 14:50:34 +0800
Subject: vhost_net: examine pointer types during un-producing

After commit fc72d1d54dd9 ("tuntap: XDP transmission"), we can
actually queueing XDP pointers in the pointer ring, so we should
examine the pointer type before freeing the pointer.

Fixes: fc72d1d54dd9 ("tuntap: XDP transmission")
Reported-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 3 ++-
 drivers/vhost/net.c    | 2 +-
 include/linux/if_tun.h | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7433bb2e4451..28cfa642e39a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -655,7 +655,7 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 	return tun;
 }
 
-static void tun_ptr_free(void *ptr)
+void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
@@ -667,6 +667,7 @@ static void tun_ptr_free(void *ptr)
 		__skb_array_destroy_skb(ptr);
 	}
 }
+EXPORT_SYMBOL_GPL(tun_ptr_free);
 
 static void tun_queue_purge(struct tun_file *tfile)
 {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index efb93063fda1..8139bc70ad7d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -170,7 +170,7 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
 		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
 				   vhost_net_buf_get_size(rxq),
-				   __skb_array_destroy_skb);
+				   tun_ptr_free);
 		rxq->head = rxq->tail = 0;
 	}
 }
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index c5b0a75a7812..fd00170b494f 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -25,6 +25,7 @@ struct ptr_ring *tun_get_tx_ring(struct file *file);
 bool tun_is_xdp_buff(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
+void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -50,5 +51,8 @@ static inline void *tun_ptr_to_xdp(void *ptr)
 {
 	return NULL;
 }
+static inline void tun_ptr_free(void *ptr)
+{
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
-- 
cgit v1.2.3


From b1d0a5d0cba4597c0394997b2d5fced3e3841b4e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 10 Mar 2018 01:15:45 +0100
Subject: netfilter: x_tables: add and use xt_check_proc_name

recent and hashlimit both create /proc files, but only check that
name is 0 terminated.

This can trigger WARN() from procfs when name is "" or "/".
Add helper for this and then use it for both.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: <syzbot+0502b00edac2a0680b61@syzkaller.appspotmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  2 ++
 net/netfilter/x_tables.c           | 30 ++++++++++++++++++++++++++++++
 net/netfilter/xt_hashlimit.c       | 16 ++++++++++------
 net/netfilter/xt_recent.c          |  6 +++---
 4 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1313b35c3ab7..14529511c4b8 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -285,6 +285,8 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size);
 bool xt_find_jump_offset(const unsigned int *offsets,
 			 unsigned int target, unsigned int size);
 
+int xt_check_proc_name(const char *name, unsigned int size);
+
 int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 		   bool inv_proto);
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index fa1655aff8d3..4aa01c90e9d1 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -423,6 +423,36 @@ textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto)
 	return buf;
 }
 
+/**
+ * xt_check_proc_name - check that name is suitable for /proc file creation
+ *
+ * @name: file name candidate
+ * @size: length of buffer
+ *
+ * some x_tables modules wish to create a file in /proc.
+ * This function makes sure that the name is suitable for this
+ * purpose, it checks that name is NUL terminated and isn't a 'special'
+ * name, like "..".
+ *
+ * returns negative number on error or 0 if name is useable.
+ */
+int xt_check_proc_name(const char *name, unsigned int size)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (strnlen(name, size) == size)
+		return -ENAMETOOLONG;
+
+	if (strcmp(name, ".") == 0 ||
+	    strcmp(name, "..") == 0 ||
+	    strchr(name, '/'))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_proc_name);
+
 int xt_check_match(struct xt_mtchk_param *par,
 		   unsigned int size, u_int8_t proto, bool inv_proto)
 {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 66f5aca62a08..3360f13dc208 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -917,8 +917,9 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
 
@@ -935,8 +936,9 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
 	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
 
@@ -950,9 +952,11 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
 static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_hashlimit_mtinfo3 *info = par->matchinfo;
+	int ret;
 
-	if (info->name[sizeof(info->name) - 1] != '\0')
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
 					 info->name, 3);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 6d232d18faff..81ee1d6543b2 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -361,9 +361,9 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
 				    info->hit_count, XT_RECENT_MAX_NSTAMPS - 1);
 		return -EINVAL;
 	}
-	if (info->name[0] == '\0' ||
-	    strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
-		return -EINVAL;
+	ret = xt_check_proc_name(info->name, sizeof(info->name));
+	if (ret)
+		return ret;
 
 	if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot)
 		nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1;
-- 
cgit v1.2.3


From a2c054a896b8ac794ddcfc7c92e2dc7ec4ed4ed5 Mon Sep 17 00:00:00 2001
From: Brad Mouring <brad.mouring@ni.com>
Date: Thu, 8 Mar 2018 16:23:03 -0600
Subject: net: phy: Tell caller result of phy_change()

In 664fcf123a30e (net: phy: Threaded interrupts allow some simplification)
the phy_interrupt system was changed to use a traditional threaded
interrupt scheme instead of a workqueue approach.

With this change, the phy status check moved into phy_change, which
did not report back to the caller whether or not the interrupt was
handled. This means that, in the case of a shared phy interrupt,
only the first phydev's interrupt registers are checked (since
phy_interrupt() would always return IRQ_HANDLED). This leads to
interrupt storms when it is a secondary device that's actually the
interrupt source.

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 145 +++++++++++++++++++++++++-------------------------
 include/linux/phy.h   |   1 -
 2 files changed, 72 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a6f924fee584..9aabfa1a455a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -617,6 +617,77 @@ static void phy_error(struct phy_device *phydev)
 	phy_trigger_machine(phydev, false);
 }
 
+/**
+ * phy_disable_interrupts - Disable the PHY interrupts from the PHY side
+ * @phydev: target phy_device struct
+ */
+static int phy_disable_interrupts(struct phy_device *phydev)
+{
+	int err;
+
+	/* Disable PHY interrupts */
+	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
+	if (err)
+		goto phy_err;
+
+	/* Clear the interrupt */
+	err = phy_clear_interrupt(phydev);
+	if (err)
+		goto phy_err;
+
+	return 0;
+
+phy_err:
+	phy_error(phydev);
+
+	return err;
+}
+
+/**
+ * phy_change - Called by the phy_interrupt to handle PHY changes
+ * @phydev: phy_device struct that interrupted
+ */
+static irqreturn_t phy_change(struct phy_device *phydev)
+{
+	if (phy_interrupt_is_valid(phydev)) {
+		if (phydev->drv->did_interrupt &&
+		    !phydev->drv->did_interrupt(phydev))
+			return IRQ_NONE;
+
+		if (phydev->state == PHY_HALTED)
+			if (phy_disable_interrupts(phydev))
+				goto phy_err;
+	}
+
+	mutex_lock(&phydev->lock);
+	if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state))
+		phydev->state = PHY_CHANGELINK;
+	mutex_unlock(&phydev->lock);
+
+	/* reschedule state queue work to run as soon as possible */
+	phy_trigger_machine(phydev, true);
+
+	if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
+		goto phy_err;
+	return IRQ_HANDLED;
+
+phy_err:
+	phy_error(phydev);
+	return IRQ_NONE;
+}
+
+/**
+ * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
+ * @work: work_struct that describes the work to be done
+ */
+void phy_change_work(struct work_struct *work)
+{
+	struct phy_device *phydev =
+		container_of(work, struct phy_device, phy_queue);
+
+	phy_change(phydev);
+}
+
 /**
  * phy_interrupt - PHY interrupt handler
  * @irq: interrupt line
@@ -632,9 +703,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	if (PHY_HALTED == phydev->state)
 		return IRQ_NONE;		/* It can't be ours.  */
 
-	phy_change(phydev);
-
-	return IRQ_HANDLED;
+	return phy_change(phydev);
 }
 
 /**
@@ -651,32 +720,6 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 	return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
 }
 
-/**
- * phy_disable_interrupts - Disable the PHY interrupts from the PHY side
- * @phydev: target phy_device struct
- */
-static int phy_disable_interrupts(struct phy_device *phydev)
-{
-	int err;
-
-	/* Disable PHY interrupts */
-	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
-	if (err)
-		goto phy_err;
-
-	/* Clear the interrupt */
-	err = phy_clear_interrupt(phydev);
-	if (err)
-		goto phy_err;
-
-	return 0;
-
-phy_err:
-	phy_error(phydev);
-
-	return err;
-}
-
 /**
  * phy_start_interrupts - request and enable interrupts for a PHY device
  * @phydev: target phy_device struct
@@ -719,50 +762,6 @@ int phy_stop_interrupts(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_stop_interrupts);
 
-/**
- * phy_change - Called by the phy_interrupt to handle PHY changes
- * @phydev: phy_device struct that interrupted
- */
-void phy_change(struct phy_device *phydev)
-{
-	if (phy_interrupt_is_valid(phydev)) {
-		if (phydev->drv->did_interrupt &&
-		    !phydev->drv->did_interrupt(phydev))
-			return;
-
-		if (phydev->state == PHY_HALTED)
-			if (phy_disable_interrupts(phydev))
-				goto phy_err;
-	}
-
-	mutex_lock(&phydev->lock);
-	if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state))
-		phydev->state = PHY_CHANGELINK;
-	mutex_unlock(&phydev->lock);
-
-	/* reschedule state queue work to run as soon as possible */
-	phy_trigger_machine(phydev, true);
-
-	if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
-		goto phy_err;
-	return;
-
-phy_err:
-	phy_error(phydev);
-}
-
-/**
- * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
- * @work: work_struct that describes the work to be done
- */
-void phy_change_work(struct work_struct *work)
-{
-	struct phy_device *phydev =
-		container_of(work, struct phy_device, phy_queue);
-
-	phy_change(phydev);
-}
-
 /**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d7069539f351..b260fb336b25 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1012,7 +1012,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
-void phy_change(struct phy_device *phydev);
 void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
-- 
cgit v1.2.3


From bf2ae2e4bf9360e07c0cdfa166bcdc0afd92f4ce Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 10 Mar 2018 18:57:50 +0800
Subject: sock_diag: request _diag module only when the family or proto has
 been registered

Now when using 'ss' in iproute, kernel would try to load all _diag
modules, which also causes corresponding family and proto modules
to be loaded as well due to module dependencies.

Like after running 'ss', sctp, dccp, af_packet (if it works as a module)
would be loaded.

For example:

  $ lsmod|grep sctp
  $ ss
  $ lsmod|grep sctp
  sctp_diag              16384  0
  sctp                  323584  5 sctp_diag
  inet_diag              24576  4 raw_diag,tcp_diag,sctp_diag,udp_diag
  libcrc32c              16384  3 nf_conntrack,nf_nat,sctp

As these family and proto modules are loaded unintentionally, it
could cause some problems, like:

- Some debug tools use 'ss' to collect the socket info, which loads all
  those diag and family and protocol modules. It's noisy for identifying
  issues.

- Users usually expect to drop sctp init packet silently when they
  have no sense of sctp protocol instead of sending abort back.

- It wastes resources (especially with multiple netns), and SCTP module
  can't be unloaded once it's loaded.

...

In short, it's really inappropriate to have these family and proto
modules loaded unexpectedly when just doing debugging with inet_diag.

This patch is to introduce sock_load_diag_module() where it loads
the _diag module only when it's corresponding family or proto has
been already registered.

Note that we can't just load _diag module without the family or
proto loaded, as some symbols used in _diag module are from the
family or proto module.

v1->v2:
  - move inet proto check to inet_diag to avoid a compiling err.
v2->v3:
  - define sock_load_diag_module in sock.c and export one symbol
    only.
  - improve the changelog.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h  |  1 +
 include/net/sock.h   |  1 +
 net/core/sock.c      | 21 +++++++++++++++++++++
 net/core/sock_diag.c | 12 ++++--------
 net/ipv4/inet_diag.c |  3 +--
 net/socket.c         |  5 +++++
 6 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 91216b16feb7..2a0391eea05c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -222,6 +222,7 @@ enum {
 int sock_wake_async(struct socket_wq *sk_wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
+bool sock_is_registered(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
 		  struct socket **res, int kern);
 int sock_create(int family, int type, int proto, struct socket **res);
diff --git a/include/net/sock.h b/include/net/sock.h
index 169c92afcafa..ae23f3b389ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1137,6 +1137,7 @@ struct proto {
 
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
+int sock_load_diag_module(int family, int protocol);
 
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index c501499a04fe..85b0b64e7f9d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3261,6 +3261,27 @@ void proto_unregister(struct proto *prot)
 }
 EXPORT_SYMBOL(proto_unregister);
 
+int sock_load_diag_module(int family, int protocol)
+{
+	if (!protocol) {
+		if (!sock_is_registered(family))
+			return -ENOENT;
+
+		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				      NETLINK_SOCK_DIAG, family);
+	}
+
+#ifdef CONFIG_INET
+	if (family == AF_INET &&
+	    !rcu_access_pointer(inet_protos[protocol]))
+		return -ENOENT;
+#endif
+
+	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+			      NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(proto_list_mutex)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 146b50e30659..c37b5be7c5e4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return -EINVAL;
 
 	if (sock_diag_handlers[req->sdiag_family] == NULL)
-		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				NETLINK_SOCK_DIAG, req->sdiag_family);
+		sock_load_diag_module(req->sdiag_family, 0);
 
 	mutex_lock(&sock_diag_table_mutex);
 	hndl = sock_diag_handlers[req->sdiag_family];
@@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	case TCPDIAG_GETSOCK:
 	case DCCPDIAG_GETSOCK:
 		if (inet_rcv_compat == NULL)
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-					NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 
 		mutex_lock(&sock_diag_table_mutex);
 		if (inet_rcv_compat != NULL)
@@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group)
 	case SKNLGRP_INET_TCP_DESTROY:
 	case SKNLGRP_INET_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET);
+			sock_load_diag_module(AF_INET, 0);
 		break;
 	case SKNLGRP_INET6_TCP_DESTROY:
 	case SKNLGRP_INET6_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET6])
-			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET6);
+			sock_load_diag_module(AF_INET6, 0);
 		break;
 	}
 	return 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a383f299ce24..4e5bc4b2f14e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -53,8 +53,7 @@ static DEFINE_MUTEX(inet_diag_table_mutex);
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
 {
 	if (!inet_diag_table[proto])
-		request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
-			       NETLINK_SOCK_DIAG, AF_INET, proto);
+		sock_load_diag_module(AF_INET, proto);
 
 	mutex_lock(&inet_diag_table_mutex);
 	if (!inet_diag_table[proto])
diff --git a/net/socket.c b/net/socket.c
index a93c99b518ca..08847c3b8c39 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2587,6 +2587,11 @@ void sock_unregister(int family)
 }
 EXPORT_SYMBOL(sock_unregister);
 
+bool sock_is_registered(int family)
+{
+	return family < NPROTO && rcu_access_pointer(net_families[family]);
+}
+
 static int __init sock_init(void)
 {
 	int err;
-- 
cgit v1.2.3


From c2b37f76485f073f020e60b5954b6dc4e55f693c Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Thu, 8 Mar 2018 15:51:41 +0200
Subject: IB/mlx5: Fix integer overflows in mlx5_ib_create_srq

This patch validates user provided input to prevent integer overflow due
to integer manipulation in the mlx5_ib_create_srq function.

Cc: syzkaller <syzkaller@googlegroups.com>
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/srq.c | 15 +++++++++------
 include/linux/mlx5/driver.h      |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 6d5fadad9090..3c7522d025f2 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -241,8 +241,8 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_srq *srq;
-	int desc_size;
-	int buf_size;
+	size_t desc_size;
+	size_t buf_size;
 	int err;
 	struct mlx5_srq_attr in = {0};
 	__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
@@ -266,15 +266,18 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 
 	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
 		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
+	if (desc_size == 0 || srq->msrq.max_gs > desc_size)
+		return ERR_PTR(-EINVAL);
 	desc_size = roundup_pow_of_two(desc_size);
-	desc_size = max_t(int, 32, desc_size);
+	desc_size = max_t(size_t, 32, desc_size);
+	if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg))
+		return ERR_PTR(-EINVAL);
 	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
 		sizeof(struct mlx5_wqe_data_seg);
 	srq->msrq.wqe_shift = ilog2(desc_size);
 	buf_size = srq->msrq.max * desc_size;
-	mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n",
-		    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
-		    srq->msrq.max_avail_gather);
+	if (buf_size < desc_size)
+		return ERR_PTR(-EINVAL);
 	in.type = init_attr->srq_type;
 
 	if (pd->uobject)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6ed79a8a8318..9d3a03364e6e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -453,8 +453,8 @@ struct mlx5_core_srq {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	u32		srqn;
 	int		max;
-	int		max_gs;
-	int		max_avail_gather;
+	size_t		max_gs;
+	size_t		max_avail_gather;
 	int		wqe_shift;
 	void (*event)	(struct mlx5_core_srq *, enum mlx5_event);
 
-- 
cgit v1.2.3


From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 6 Mar 2018 19:34:42 -0800
Subject: workqueue: remove unused cancel_work()

Found this by accident.
There are no usages of bare cancel_work() in current kernel source.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 1 -
 kernel/workqueue.c        | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bc0cda180c8b..0c3301421c57 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern bool flush_work(struct work_struct *work);
-extern bool cancel_work(struct work_struct *work);
 extern bool cancel_work_sync(struct work_struct *work);
 
 extern bool flush_delayed_work(struct delayed_work *dwork);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccd1080dd6e7..6ec6ba65127b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
 	return ret;
 }
 
-/*
- * See cancel_delayed_work()
- */
-bool cancel_work(struct work_struct *work)
-{
-	return __cancel_work(work, false);
-}
-
 /**
  * cancel_delayed_work - cancel a delayed work
  * @dwork: delayed_work to cancel
-- 
cgit v1.2.3


From 4dcb31d4649df36297296b819437709f5407059c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 14 Mar 2018 09:04:16 -0700
Subject: net: use skb_to_full_sk() in skb_update_prio()

Andrei Vagin reported a KASAN: slab-out-of-bounds error in
skb_update_prio()

Since SYNACK might be attached to a request socket, we need to
get back to the listener socket.
Since this listener is manipulated without locks, add const
qualifiers to sock_cgroup_prioidx() so that the const can also
be used in skb_update_prio()

Also add the const qualifier to sock_cgroup_classid() for consistency.

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cgroup-defs.h |  4 ++--
 net/core/dev.c              | 22 +++++++++++++++-------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 9f242b876fde..f8e76d01a5ad 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -755,13 +755,13 @@ struct sock_cgroup_data {
  * updaters and return part of the previous pointer as the prioidx or
  * classid.  Such races are short-lived and the result isn't critical.
  */
-static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
+static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 {
 	/* fallback to 1 which is always the ID of the root cgroup */
 	return (skcd->is_data & 1) ? skcd->prioidx : 1;
 }
 
-static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
+static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
 	/* fallback to 0 which is the unconfigured default classid */
 	return (skcd->is_data & 1) ? skcd->classid : 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 2cedf520cb28..12be20535714 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3278,15 +3278,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 static void skb_update_prio(struct sk_buff *skb)
 {
-	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+	const struct netprio_map *map;
+	const struct sock *sk;
+	unsigned int prioidx;
 
-	if (!skb->priority && skb->sk && map) {
-		unsigned int prioidx =
-			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
+	if (skb->priority)
+		return;
+	map = rcu_dereference_bh(skb->dev->priomap);
+	if (!map)
+		return;
+	sk = skb_to_full_sk(skb);
+	if (!sk)
+		return;
 
-		if (prioidx < map->priomap_len)
-			skb->priority = map->priomap[prioidx];
-	}
+	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
+
+	if (prioidx < map->priomap_len)
+		skb->priority = map->priomap[prioidx];
 }
 #else
 #define skb_update_prio(skb)
-- 
cgit v1.2.3


From d52e5a7e7ca49457dd31fc8b42fb7c0d58a31221 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Mar 2018 10:21:14 +0100
Subject: ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu

Prior to the rework of PMTU information storage in commit
2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer."),
when a PMTU event advertising a PMTU smaller than
net.ipv4.route.min_pmtu was received, we would disable setting the DF
flag on packets by locking the MTU metric, and set the PMTU to
net.ipv4.route.min_pmtu.

Since then, we don't disable DF, and set PMTU to
net.ipv4.route.min_pmtu, so the intermediate router that has this link
with a small MTU will have to drop the packets.

This patch reestablishes pre-2.6.39 behavior by splitting
rtable->rt_pmtu into a bitfield with rt_mtu_locked and rt_pmtu.
rt_mtu_locked indicates that we shouldn't set the DF bit on that path,
and is checked in ip_dont_fragment().

One possible workaround is to set net.ipv4.route.min_pmtu to a value low
enough to accommodate the lowest MTU encountered.

Fixes: 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h        | 11 +++++++++--
 include/net/ip_fib.h    |  1 +
 include/net/route.h     |  3 ++-
 net/ipv4/route.c        | 26 +++++++++++++++++++-------
 net/ipv4/xfrm4_policy.c |  1 +
 5 files changed, 32 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 746abff9ce51..f49b3a576bec 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -328,6 +328,13 @@ int ip_decrease_ttl(struct iphdr *iph)
 	return --iph->ttl;
 }
 
+static inline int ip_mtu_locked(const struct dst_entry *dst)
+{
+	const struct rtable *rt = (const struct rtable *)dst;
+
+	return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
+}
+
 static inline
 int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 {
@@ -335,7 +342,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
 	return  pmtudisc == IP_PMTUDISC_DO ||
 		(pmtudisc == IP_PMTUDISC_WANT &&
-		 !(dst_metric_locked(dst, RTAX_MTU)));
+		 !ip_mtu_locked(dst));
 }
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
@@ -361,7 +368,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	struct net *net = dev_net(dst->dev);
 
 	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
-	    dst_metric_locked(dst, RTAX_MTU) ||
+	    ip_mtu_locked(dst) ||
 	    !forwarding)
 		return dst_mtu(dst);
 
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f80524396c06..77d0a78cf7d2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -59,6 +59,7 @@ struct fib_nh_exception {
 	int				fnhe_genid;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
+	bool				fnhe_mtu_locked;
 	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	struct rtable __rcu		*fnhe_rth_input;
diff --git a/include/net/route.h b/include/net/route.h
index 40b870d58f38..20a92ca9e115 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -63,7 +63,8 @@ struct rtable {
 	__be32			rt_gateway;
 
 	/* Miscellaneous cached information */
-	u32			rt_pmtu;
+	u32			rt_mtu_locked:1,
+				rt_pmtu:31;
 
 	u32			rt_table_id;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f9dbb8cb66bf..299e247b2032 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -634,6 +634,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 {
 	rt->rt_pmtu = fnhe->fnhe_pmtu;
+	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 	rt->dst.expires = fnhe->fnhe_expires;
 
 	if (fnhe->fnhe_gw) {
@@ -644,7 +645,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 }
 
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-				  u32 pmtu, unsigned long expires)
+				  u32 pmtu, bool lock, unsigned long expires)
 {
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
@@ -681,8 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 			fnhe->fnhe_genid = genid;
 		if (gw)
 			fnhe->fnhe_gw = gw;
-		if (pmtu)
+		if (pmtu) {
 			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_mtu_locked = lock;
+		}
 		fnhe->fnhe_expires = max(1UL, expires);
 		/* Update all cached dsts too */
 		rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -706,6 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		fnhe->fnhe_daddr = daddr;
 		fnhe->fnhe_gw = gw;
 		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_mtu_locked = lock;
 		fnhe->fnhe_expires = expires;
 
 		/* Exception created; mark the cached routes for the nexthop
@@ -787,7 +791,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 				struct fib_nh *nh = &FIB_RES_NH(res);
 
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
-						0, jiffies + ip_rt_gc_timeout);
+						0, false,
+						jiffies + ip_rt_gc_timeout);
 			}
 			if (kill_route)
 				rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1009,15 +1014,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct dst_entry *dst = &rt->dst;
 	struct fib_result res;
+	bool lock = false;
 
-	if (dst_metric_locked(dst, RTAX_MTU))
+	if (ip_mtu_locked(dst))
 		return;
 
 	if (ipv4_mtu(dst) < mtu)
 		return;
 
-	if (mtu < ip_rt_min_pmtu)
+	if (mtu < ip_rt_min_pmtu) {
+		lock = true;
 		mtu = ip_rt_min_pmtu;
+	}
 
 	if (rt->rt_pmtu == mtu &&
 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
@@ -1027,7 +1035,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
-		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
 				      jiffies + ip_rt_mtu_expires);
 	}
 	rcu_read_unlock();
@@ -1280,7 +1288,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 
 	mtu = READ_ONCE(dst->dev->mtu);
 
-	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+	if (unlikely(ip_mtu_locked(dst))) {
 		if (rt->rt_uses_gateway && mtu > 576)
 			mtu = 576;
 	}
@@ -1521,6 +1529,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_is_input = 0;
 		rt->rt_iif = 0;
 		rt->rt_pmtu = 0;
+		rt->rt_mtu_locked = 0;
 		rt->rt_gateway = 0;
 		rt->rt_uses_gateway = 0;
 		rt->rt_table_id = 0;
@@ -2546,6 +2555,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
+		rt->rt_mtu_locked = ort->rt_mtu_locked;
 
 		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
@@ -2648,6 +2658,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
 	if (rt->rt_pmtu && expires)
 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+	if (rt->rt_mtu_locked && expires)
+		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8d33f7b311f4..fbebda67ac1b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,6 +100,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
 	xdst->u.rt.rt_table_id = rt->rt_table_id;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
 	rt_add_uncached_list(&xdst->u.rt);
-- 
cgit v1.2.3


From c658dc58c7eaa8569ceb0edd1ddbdfda84fe8aa5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 15 Mar 2018 11:22:28 +0200
Subject: mmc: core: Fix tracepoint print of blk_addr and blksz

Swap the positions of blk_addr and blksz in the tracepoint print arguments
so that they match the print format.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: d2f82254e4e8 ("mmc: core: Add members to mmc_request and mmc_data for CQE's")
Cc: <stable@vger.kernel.org> # 4.14+
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/trace/events/mmc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h
index 200f731be557..7b706ff21335 100644
--- a/include/trace/events/mmc.h
+++ b/include/trace/events/mmc.h
@@ -86,8 +86,8 @@ TRACE_EVENT(mmc_request_start,
 		  __entry->stop_flags, __entry->stop_retries,
 		  __entry->sbc_opcode, __entry->sbc_arg,
 		  __entry->sbc_flags, __entry->sbc_retries,
-		  __entry->blocks, __entry->blk_addr,
-		  __entry->blksz, __entry->data_flags, __entry->tag,
+		  __entry->blocks, __entry->blksz,
+		  __entry->blk_addr, __entry->data_flags, __entry->tag,
 		  __entry->can_retune, __entry->doing_retune,
 		  __entry->retune_now, __entry->need_retune,
 		  __entry->hold_retune, __entry->retune_period)
-- 
cgit v1.2.3


From 4bbb3e0e8239f9079bf1fe20b3c0cb598714ae61 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 13 Mar 2018 14:51:27 +0900
Subject: net: Fix vlan untag for bridge and vlan_dev with reorder_hdr off

When we have a bridge with vlan_filtering on and a vlan device on top of
it, packets would be corrupted in skb_vlan_untag() called from
br_dev_xmit().

The problem sits in skb_reorder_vlan_header() used in skb_vlan_untag(),
which makes use of skb->mac_len. In this function mac_len is meant for
handling rx path with vlan devices with reorder_header disabled, but in
tx path mac_len is typically 0 and cannot be used, which is the problem
in this case.

The current code even does not properly handle rx path (skb_vlan_untag()
called from __netif_receive_skb_core()) with reorder_header off actually.

In rx path single tag case, it works as follows:

- Before skb_reorder_vlan_header()

 mac_header                                data
   v                                        v
   +-------------------+-------------+------+----
   |        ETH        |    VLAN     | ETH  |
   |       ADDRS       | TPID | TCI  | TYPE |
   +-------------------+-------------+------+----
   <-------- mac_len --------->
                       <------------->
                        to be removed

- After skb_reorder_vlan_header()

            mac_header                     data
                 v                          v
                 +-------------------+------+----
                 |        ETH        | ETH  |
                 |       ADDRS       | TYPE |
                 +-------------------+------+----
                 <-------- mac_len --------->

This is ok, but in rx double tag case, it corrupts packets:

- Before skb_reorder_vlan_header()

 mac_header                                              data
   v                                                      v
   +-------------------+-------------+-------------+------+----
   |        ETH        |    VLAN     |    VLAN     | ETH  |
   |       ADDRS       | TPID | TCI  | TPID | TCI  | TYPE |
   +-------------------+-------------+-------------+------+----
   <--------------- mac_len ---------------->
                                     <------------->
                                    should be removed
                       <--------------------------->
                         actually will be removed

- After skb_reorder_vlan_header()

            mac_header                                   data
                 v                                        v
                               +-------------------+------+----
                               |        ETH        | ETH  |
                               |       ADDRS       | TYPE |
                               +-------------------+------+----
                 <--------------- mac_len ---------------->

So, two of vlan tags are both removed while only inner one should be
removed and mac_header (and mac_len) is broken.

skb_vlan_untag() is meant for removing the vlan header at (skb->data - 2),
so use skb->data and skb->mac_header to calculate the right offset.

Reported-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Fixes: a6e18ff11170 ("vlan: Fix untag operations of stacked vlans with REORDER_HEADER off")
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 net/core/skbuff.c             | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 8bbbcb5cd94b..820de5d222d2 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -30,6 +30,7 @@
  */
 
 #define ETH_ALEN	6		/* Octets in one ethernet addr	 */
+#define ETH_TLEN	2		/* Octets in ethernet type field */
 #define ETH_HLEN	14		/* Total octets in header.	 */
 #define ETH_ZLEN	60		/* Min. octets in frame sans FCS */
 #define ETH_DATA_LEN	1500		/* Max. octets in payload	 */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index baf990528943..b103f46ec512 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5020,13 +5020,16 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
 
 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
 {
+	int mac_len;
+
 	if (skb_cow(skb, skb_headroom(skb)) < 0) {
 		kfree_skb(skb);
 		return NULL;
 	}
 
-	memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
-		2 * ETH_ALEN);
+	mac_len = skb->data - skb_mac_header(skb);
+	memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
+		mac_len - VLAN_HLEN - ETH_TLEN);
 	skb->mac_header += VLAN_HLEN;
 	return skb;
 }
-- 
cgit v1.2.3


From cbe7128c4b92e2004984f477fd38dfa81662f02e Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 13 Mar 2018 14:51:28 +0900
Subject: vlan: Fix out of order vlan headers with reorder header off

With reorder header off, received packets are untagged in skb_vlan_untag()
called from within __netif_receive_skb_core(), and later the tag will be
inserted back in vlan_do_receive().

This caused out of order vlan headers when we create a vlan device on top
of another vlan device, because vlan_do_receive() inserts a tag as the
outermost vlan tag. E.g. the outer tag is first removed in skb_vlan_untag()
and inserted back in vlan_do_receive(), then the inner tag is next removed
and inserted back as the outermost tag.

This patch fixes the behaviour by inserting the inner tag at the right
position.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 66 ++++++++++++++++++++++++++++++++++++++++---------
 net/8021q/vlan_core.c   |  4 +--
 2 files changed, 57 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 5e6a2d4dc366..c4a1cff9c768 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -300,30 +300,34 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features,
 }
 
 /**
- * __vlan_insert_tag - regular VLAN tag inserting
+ * __vlan_insert_inner_tag - inner VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
+ * @mac_len: MAC header length including outer vlan headers
  *
- * Inserts the VLAN tag into @skb as part of the payload
+ * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
  * Returns error if skb_cow_head failes.
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-static inline int __vlan_insert_tag(struct sk_buff *skb,
-				    __be16 vlan_proto, u16 vlan_tci)
+static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
+					  __be16 vlan_proto, u16 vlan_tci,
+					  unsigned int mac_len)
 {
 	struct vlan_ethhdr *veth;
 
 	if (skb_cow_head(skb, VLAN_HLEN) < 0)
 		return -ENOMEM;
 
-	veth = skb_push(skb, VLAN_HLEN);
+	skb_push(skb, VLAN_HLEN);
 
-	/* Move the mac addresses to the beginning of the new header. */
-	memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
+	/* Move the mac header sans proto to the beginning of the new header. */
+	memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
 	skb->mac_header -= VLAN_HLEN;
 
+	veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
+
 	/* first, the ethernet type */
 	veth->h_vlan_proto = vlan_proto;
 
@@ -334,12 +338,30 @@ static inline int __vlan_insert_tag(struct sk_buff *skb,
 }
 
 /**
- * vlan_insert_tag - regular VLAN tag inserting
+ * __vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
  *
  * Inserts the VLAN tag into @skb as part of the payload
+ * Returns error if skb_cow_head failes.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
+ */
+static inline int __vlan_insert_tag(struct sk_buff *skb,
+				    __be16 vlan_proto, u16 vlan_tci)
+{
+	return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
+}
+
+/**
+ * vlan_insert_inner_tag - inner VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
+ * @vlan_tci: VLAN TCI to insert
+ * @mac_len: MAC header length including outer vlan headers
+ *
+ * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
  * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
  *
  * Following the skb_unshare() example, in case of error, the calling function
@@ -347,12 +369,14 @@ static inline int __vlan_insert_tag(struct sk_buff *skb,
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
-					      __be16 vlan_proto, u16 vlan_tci)
+static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
+						    __be16 vlan_proto,
+						    u16 vlan_tci,
+						    unsigned int mac_len)
 {
 	int err;
 
-	err = __vlan_insert_tag(skb, vlan_proto, vlan_tci);
+	err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len);
 	if (err) {
 		dev_kfree_skb_any(skb);
 		return NULL;
@@ -360,6 +384,26 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
 	return skb;
 }
 
+/**
+ * vlan_insert_tag - regular VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
+ * @vlan_tci: VLAN TCI to insert
+ *
+ * Inserts the VLAN tag into @skb as part of the payload
+ * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
+ */
+static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
+					      __be16 vlan_proto, u16 vlan_tci)
+{
+	return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
+}
+
 /**
  * vlan_insert_tag_set_proto - regular VLAN tag inserting
  * @skb: skbuff to tag
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 64aa9f755e1d..45c9bf5ff3a0 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -48,8 +48,8 @@ bool vlan_do_receive(struct sk_buff **skbp)
 		 * original position later
 		 */
 		skb_push(skb, offset);
-		skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto,
-					      skb->vlan_tci);
+		skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
+						    skb->vlan_tci, skb->mac_len);
 		if (!skb)
 			return false;
 		skb_pull(skb, offset + VLAN_HLEN);
-- 
cgit v1.2.3


From a6618f4aedb2b60932d766bd82ae7ce866e842aa Mon Sep 17 00:00:00 2001
From: Kirill Marinushkin <k.marinushkin@gmail.com>
Date: Mon, 19 Mar 2018 07:11:08 +0100
Subject: ALSA: usb-audio: Fix parsing descriptor of UAC2 processing unit

Currently, the offsets in the UAC2 processing unit descriptor are
calculated incorrectly. It causes an issue when connecting the device which
provides such a feature:

~~~~
[84126.724420] usb 1-1.3.1: invalid Processing Unit descriptor (id 18)
~~~~

After this patch is applied, the UAC2 processing unit inits w/o this error.

Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0")
Signed-off-by: Kirill Marinushkin <k.marinushkin@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/linux/usb/audio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h
index 17a022c5b414..da3315ed1bcd 100644
--- a/include/uapi/linux/usb/audio.h
+++ b/include/uapi/linux/usb/audio.h
@@ -370,7 +370,7 @@ static inline __u8 uac_processing_unit_bControlSize(struct uac_processing_unit_d
 {
 	return (protocol == UAC_VERSION_1) ?
 		desc->baSourceID[desc->bNrInPins + 4] :
-		desc->baSourceID[desc->bNrInPins + 6];
+		2; /* in UAC2, this value is constant */
 }
 
 static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_descriptor *desc,
@@ -378,7 +378,7 @@ static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_de
 {
 	return (protocol == UAC_VERSION_1) ?
 		&desc->baSourceID[desc->bNrInPins + 5] :
-		&desc->baSourceID[desc->bNrInPins + 7];
+		&desc->baSourceID[desc->bNrInPins + 6];
 }
 
 static inline __u8 uac_processing_unit_iProcessing(struct uac_processing_unit_descriptor *desc,
-- 
cgit v1.2.3


From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Mar 2018 12:45:12 -0700
Subject: percpu_ref: Update doc to dissuade users from depending on internal
 RCU grace periods

percpu_ref internally uses sched-RCU to implement the percpu -> atomic
mode switching and the documentation suggested that this could be
depended upon.  This doesn't seem like a good idea.

* percpu_ref uses sched-RCU which has different grace periods regular
  RCU.  Users may combine percpu_ref with regular RCU usage and
  incorrectly believe that regular RCU grace periods are performed by
  percpu_ref.  This can lead to, for example, use-after-free due to
  premature freeing.

* percpu_ref has a grace period when switching from percpu to atomic
  mode.  It doesn't have one between the last put and release.  This
  distinction is subtle and can lead to surprising bugs.

* percpu_ref allows starting in and switching to atomic mode manually
  for debugging and other purposes.  This means that there may not be
  any grace periods from kill to release.

This patch makes it clear that the grace periods are percpu_ref's
internal implementation detail and can't be depended upon by the
users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu-refcount.h | 18 ++++++++++++------
 lib/percpu-refcount.c           |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 864d167a1073..009cdf3d65b6 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -30,10 +30,14 @@
  * calls io_destroy() or the process exits.
  *
  * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
- * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
- * the kioctx from the proccess's list of kioctxs - after that, there can't be
- * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
- * the initial ref with percpu_ref_put().
+ * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref.
+ * After that, there can't be any new users of the kioctx (from lookup_ioctx())
+ * and it's then safe to drop the initial ref with percpu_ref_put().
+ *
+ * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
+ * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
+ * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
+ * with RCU protection, it must be done explicitly.
  *
  * Code that does a two stage shutdown like this often needs some kind of
  * explicit synchronization to ensure the initial refcount can only be dropped
@@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref);
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Switches @ref into atomic mode before gathering up the percpu counters
+ * and dropping the initial ref.
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 static inline void percpu_ref_kill(struct percpu_ref *ref)
 {
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 30e7dd88148b..9f96fa7bc000 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
  * process of switching to atomic mode by percpu_ref_switch_to_atomic().
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill)
-- 
cgit v1.2.3


From 80cf79ae4f688ead1dca8878990709c0446c3992 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 19 Mar 2018 12:21:43 +0200
Subject: RDMA/verbs: Remove restrack entry from XRCD structure

XRCD object is not implemented in the restrack, so lets remove it.

Fixes: 02d8883f520e ("RDMA/restrack: Add general infrastructure to track RDMA resources")
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_verbs.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 73b2387e3f74..ff3ed435701f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1537,10 +1537,6 @@ struct ib_xrcd {
 
 	struct mutex		tgt_qp_mutex;
 	struct list_head	tgt_qp_list;
-	/*
-	 * Implementation details of the RDMA core, don't use in drivers:
-	 */
-	struct rdma_restrack_entry res;
 };
 
 struct ib_ah {
-- 
cgit v1.2.3


From 578ae447e7e5d78c90ac40a06406c1741f79ba96 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 19 Mar 2018 13:18:57 -0500
Subject: jump_label: Disable jump labels in __exit code

With the following commit:

  333522447063 ("jump_label: Explicitly disable jump labels in __init code")

... we explicitly disabled jump labels in __init code, so they could be
detected and not warned about in the following commit:

  dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")

In-kernel __exit code has the same issue.  It's never used, so it's
freed along with the rest of initmem.  But jump label entries in __exit
code aren't explicitly disabled, so we get the following warning when
enabling pr_debug() in __exit code:

  can't patch jump_label at dmi_sysfs_exit+0x0/0x2d
  WARNING: CPU: 0 PID: 22572 at kernel/jump_label.c:376 __jump_label_update+0x9d/0xb0

Fix the warning by disabling all jump labels in initmem (which includes
both __init and __exit code).

Reported-and-tested-by: Li Wang <liwang@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")
Link: http://lkml.kernel.org/r/7121e6e595374f06616c505b6e690e275c0054d1.1521483452.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 4 ++--
 init/main.c                | 2 +-
 kernel/jump_label.c        | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2168cc6b8b30..b46b541c67c4 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -151,7 +151,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 extern void jump_label_init(void);
-extern void jump_label_invalidate_init(void);
+extern void jump_label_invalidate_initmem(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -199,7 +199,7 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 }
 
-static inline void jump_label_invalidate_init(void) {}
+static inline void jump_label_invalidate_initmem(void) {}
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
diff --git a/init/main.c b/init/main.c
index 969eaf140ef0..21efbf6ace93 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1001,7 +1001,7 @@ static int __ref kernel_init(void *unused)
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
 	ftrace_free_init_mem();
-	jump_label_invalidate_init();
+	jump_label_invalidate_initmem();
 	free_initmem();
 	mark_readonly();
 	system_state = SYSTEM_RUNNING;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e7214093dcd1..01ebdf1f9f40 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
+#include <asm/sections.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -421,15 +422,15 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
-/* Disable any jump label entries in __init code */
-void __init jump_label_invalidate_init(void)
+/* Disable any jump label entries in __init/__exit code */
+void __init jump_label_invalidate_initmem(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
 	struct jump_entry *iter_stop = __stop___jump_table;
 	struct jump_entry *iter;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (init_kernel_text(iter->code))
+		if (init_section_contains((void *)(unsigned long)iter->code, 1))
 			iter->code = 0;
 	}
 }
-- 
cgit v1.2.3


From 6b00c35138b404be98b85f4a703be594cbed501c Mon Sep 17 00:00:00 2001
From: Jagdish Gediya <jagdish.gediya@nxp.com>
Date: Thu, 22 Mar 2018 01:08:10 +0530
Subject: mtd: nand: fsl_ifc: Read ECCSTAT0 and ECCSTAT1 registers for IFC 2.0

Due to missing information in Hardware manual, current
implementation doesn't read ECCSTAT0 and ECCSTAT1 registers
for IFC 2.0.

Add support to read ECCSTAT0 and ECCSTAT1 registers during
ecccheck for IFC 2.0.

Fixes: 656441478ed5 ("mtd: nand: ifc: Fix location of eccstat registers for IFC V1.0")
Cc: stable@vger.kernel.org # v3.18+
Signed-off-by: Jagdish Gediya <jagdish.gediya@nxp.com>
Reviewed-by: Prabhakar Kushwaha <prabhakar.kushwaha@nxp.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/fsl_ifc_nand.c | 6 +-----
 include/linux/fsl_ifc.h         | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index fae01b04abc7..5a9c2f0020c2 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -227,11 +227,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 		int sector_end = sector_start + chip->ecc.steps - 1;
 		__be32 *eccstat_regs;
 
-		if (ctrl->version >= FSL_IFC_VERSION_2_0_0)
-			eccstat_regs = ifc->ifc_nand.v2_nand_eccstat;
-		else
-			eccstat_regs = ifc->ifc_nand.v1_nand_eccstat;
-
+		eccstat_regs = ifc->ifc_nand.nand_eccstat;
 		eccstat = ifc_in32(&eccstat_regs[sector_start / 4]);
 
 		for (i = sector_start; i <= sector_end; i++) {
diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h
index c332f0a45607..3fdfede2f0f3 100644
--- a/include/linux/fsl_ifc.h
+++ b/include/linux/fsl_ifc.h
@@ -734,11 +734,7 @@ struct fsl_ifc_nand {
 	u32 res19[0x10];
 	__be32 nand_fsr;
 	u32 res20;
-	/* The V1 nand_eccstat is actually 4 words that overlaps the
-	 * V2 nand_eccstat.
-	 */
-	__be32 v1_nand_eccstat[2];
-	__be32 v2_nand_eccstat[6];
+	__be32 nand_eccstat[8];
 	u32 res21[0x1c];
 	__be32 nanndcr;
 	u32 res22[0x2];
-- 
cgit v1.2.3


From 7c181f4fcdc62e5dc7a87fd33387d322262c3b52 Mon Sep 17 00:00:00 2001
From: Ben Caradoc-Davies <ben@transient.nz>
Date: Mon, 19 Mar 2018 12:57:44 +1300
Subject: mac80211: add ieee80211_hw flag for QoS NDP support

Commit 7b6ddeaf27ec ("mac80211: use QoS NDP for AP probing") added an
argument qos_ok to ieee80211_nullfunc_get to support QoS NDP. Despite
the claim in the commit log "Change all the drivers to *not* allow
QoS NDP for now, even though it looks like most of them should be OK
with that", this commit enables QoS NDP in response to beacons (see
change to mlme.c:ieee80211_send_nullfunc), causing ath9k_htc to lose
IP connectivity. See:
https://patchwork.kernel.org/patch/10241109/
https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=891060

Introduce a hardware flag to allow such buggy drivers to override the
correct default behaviour of mac80211 of sending QoS NDP packets.

Signed-off-by: Ben Caradoc-Davies <ben@transient.nz>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 net/mac80211/debugfs.c | 1 +
 net/mac80211/mlme.c    | 3 ++-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c96511fa9198..2b581bd93812 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2063,6 +2063,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
  *	TDLS links.
  *
+ * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't
+ *	support QoS NDP for AP probing - that's most likely a driver bug.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2106,6 +2109,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_REPORTS_LOW_ACK,
 	IEEE80211_HW_SUPPORTS_TX_FRAG,
 	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
+	IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 1f466d12a6bc..94c7ee9df33b 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -212,6 +212,7 @@ static const char *hw_flag_names[] = {
 	FLAG(REPORTS_LOW_ACK),
 	FLAG(SUPPORTS_TX_FRAG),
 	FLAG(SUPPORTS_TDLS_BUFFER_STA),
+	FLAG(DOESNT_SUPPORT_QOS_NDP),
 #undef FLAG
 };
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 39b660b9a908..5f303abac5ad 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -896,7 +896,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	struct ieee80211_hdr_3addr *nullfunc;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true);
+	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif,
+		!ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP));
 	if (!skb)
 		return;
 
-- 
cgit v1.2.3


From 5df7af85ecd88e8b5f1f31d6456c3cf38a8bbdda Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Tue, 20 Mar 2018 09:44:52 +0800
Subject: net: phy: Add general dummy stubs for MMD register access

For some phy devices, even though they don't support the MMD extended
register access, it does have some side effect if we are trying to
read/write the MMD registers via indirect method. So introduce general
dummy stubs for MMD register access which these devices can use to avoid
such side effect.

Fixes: b6b5e8a69118 ("gianfar: Disable EEE autoneg by default")
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 17 +++++++++++++++++
 include/linux/phy.h          |  4 ++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index fe16f5894092..74664a6c0cdc 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1673,6 +1673,23 @@ int genphy_config_init(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(genphy_config_init);
 
+/* This is used for the phy device which doesn't support the MMD extended
+ * register access, but it does have side effect when we are trying to access
+ * the MMD register via indirect method.
+ */
+int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(genphy_read_mmd_unsupported);
+
+int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
+				 u16 regnum, u16 val)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(genphy_write_mmd_unsupported);
+
 int genphy_suspend(struct phy_device *phydev)
 {
 	return phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index b260fb336b25..7c4c2379e010 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -984,6 +984,10 @@ static inline int genphy_no_soft_reset(struct phy_device *phydev)
 {
 	return 0;
 }
+int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad,
+				u16 regnum);
+int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
+				 u16 regnum, u16 val);
 
 /* Clause 45 PHY */
 int genphy_c45_restart_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3


From b6bdb7517c3d3f41f20e5c2948d6bc3f8897394e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Thu, 22 Mar 2018 16:17:20 -0700
Subject: mm/vmalloc: add interfaces to free unmapped page table

On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings.  A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.

 1. ioremap a 4K size, valid page table will build,
 2. iounmap it, pte0 will set to 0;
 3. ioremap the same address with 2M size, pgd/pmd is unchanged,
    then set the a new value for pmd;
 4. pte0 is leaked;
 5. CPU may meet exception because the old pmd is still in TLB,
    which will lead to kernel panic.

This panic is not reproducible on x86.  INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86.  x86
still has memory leak.

The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:

 - The iounmap() path is shared with vunmap(). Since vmap() only
   supports pte mappings, making vunmap() to free a pte page is an
   overhead for regular vmap users as they do not need a pte page freed
   up.

 - Checking if all entries in a pte page are cleared in the unmap path
   is racy, and serializing this check is expensive.

 - The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
   Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
   purge.

Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.

This patch implements their stub functions on x86 and arm64, which work
as workaround.

[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c           | 10 ++++++++++
 arch/x86/mm/pgtable.c         | 24 ++++++++++++++++++++++++
 include/asm-generic/pgtable.h | 10 ++++++++++
 lib/ioremap.c                 |  6 ++++--
 4 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8c704f1e53c2..2dbb2c9f1ec1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp)
 	pmd_clear(pmdp);
 	return 1;
 }
+
+int pud_free_pmd_page(pud_t *pud)
+{
+	return pud_none(*pud);
+}
+
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	return pmd_none(*pmd);
+}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 004abf9ebf12..1eed7ed518e6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -702,4 +702,28 @@ int pmd_clear_huge(pmd_t *pmd)
 
 	return 0;
 }
+
+/**
+ * pud_free_pmd_page - Clear pud entry and free pmd page.
+ * @pud: Pointer to a PUD.
+ *
+ * Context: The pud range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pud_free_pmd_page(pud_t *pud)
+{
+	return pud_none(*pud);
+}
+
+/**
+ * pmd_free_pte_page - Clear pmd entry and free pte page.
+ * @pmd: Pointer to a PMD.
+ *
+ * Context: The pmd range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	return pmd_none(*pmd);
+}
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 2cfa3075d148..bfbb44a5ad38 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud);
+int pmd_free_pte_page(pmd_t *pmd);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
+static inline int pud_free_pmd_page(pud_t *pud)
+{
+	return 0;
+}
+static inline int pmd_free_pte_page(pmd_t *pmd)
+{
+	return 0;
+}
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
diff --git a/lib/ioremap.c b/lib/ioremap.c
index b808a390e4c3..54e5bbaa3200 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
+		    pmd_free_pte_page(pmd)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
+		    pud_free_pmd_page(pud)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
-- 
cgit v1.2.3


From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001
From: Daniel Vacek <neelx@redhat.com>
Date: Thu, 22 Mar 2018 16:17:38 -0700
Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where
 possible"

This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of
invalid pfns where possible").  The commit is meant to be a boot init
speed up skipping the loop in memmap_init_zone() for invalid pfns.

But given some specific memory mapping on x86_64 (or more generally
theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the
implementation also skips valid pfns which is plain wrong and causes
'kernel BUG at mm/page_alloc.c:1389!'

  crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
  kernel BUG at mm/page_alloc.c:1389!
  invalid opcode: 0000 [#1] SMP
  --
  RIP: 0010: move_freepages+0x15e/0x160
  --
  Call Trace:
    move_freepages_block+0x73/0x80
    __rmqueue+0x263/0x460
    get_page_from_freelist+0x7e1/0x9e0
    __alloc_pages_nodemask+0x176/0x420
  --

  crash> page_init_bug -v | grep RAM
  <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
  <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
  <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
  <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
  <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
  <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)

  crash> page_init_bug | head -6
  <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
  <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
  <struct page 0xffffea0001ede200>       505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
  <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
  <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
  BUG, zones differ!

  crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
        PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
  ffffea0001e00000  78000000                0        0  0 0
  ffffea0001ed7fc0  7b5ff000                0        0  0 0
  ffffea0001ed8000  7b600000                0        0  0 0       <<<<
  ffffea0001ede1c0  7b787000                0        0  0 0
  ffffea0001ede200  7b788000                0        0  1 1fffff00000000

Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com
Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
Signed-off-by: Daniel Vacek <neelx@redhat.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  1 -
 mm/memblock.c            | 28 ----------------------------
 mm/page_alloc.c          | 11 +----------
 3 files changed, 1 insertion(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8be5077efb5f..f92ea7783652 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index b6ba6b7adadc..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(++pfn);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn is valid */
-			return pfn;
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return -1UL;
-	else
-		return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 010dee0f089e..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-- 
cgit v1.2.3