From 00085f1efa387a8ce100e3734920f7639c80caa3 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Wed, 3 Aug 2016 13:46:00 -0700
Subject: dma-mapping: use unsigned long for dma_attrs

The dma-mapping core and the implementations do not change the DMA
attributes passed by pointer.  Thus the pointer can point to const data.
However the attributes do not have to be a bitfield.  Instead unsigned
long will do fine:

1. This is just simpler.  Both in terms of reading the code and setting
   attributes.  Instead of initializing local attributes on the stack
   and passing pointer to it to dma_set_attr(), just set the bits.

2. It brings safeness and checking for const correctness because the
   attributes are passed by value.

Semantic patches for this change (at least most of them):

    virtual patch
    virtual context

    @r@
    identifier f, attrs;

    @@
    f(...,
    - struct dma_attrs *attrs
    + unsigned long attrs
    , ...)
    {
    ...
    }

    @@
    identifier r.f;
    @@
    f(...,
    - NULL
    + 0
     )

and

    // Options: --all-includes
    virtual patch
    virtual context

    @r@
    identifier f, attrs;
    type t;

    @@
    t f(..., struct dma_attrs *attrs);

    @@
    identifier r.f;
    @@
    f(...,
    - NULL
    + 0
     )

Link: http://lkml.kernel.org/r/1468399300-5399-2-git-send-email-k.kozlowski@samsung.com
Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Vineet Gupta <vgupta@synopsys.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Hans-Christian Noren Egtvedt <egtvedt@samfundet.no>
Acked-by: Mark Salter <msalter@redhat.com> [c6x]
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com> [cris]
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> [drm]
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Joerg Roedel <jroedel@suse.de> [iommu]
Acked-by: Fabien Dessenne <fabien.dessenne@st.com> [bdisp]
Reviewed-by: Marek Szyprowski <m.szyprowski@samsung.com> [vb2-core]
Acked-by: David Vrabel <david.vrabel@citrix.com> [xen]
Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> [xen swiotlb]
Acked-by: Joerg Roedel <jroedel@suse.de> [iommu]
Acked-by: Richard Kuo <rkuo@codeaurora.org> [hexagon]
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> [m68k]
Acked-by: Gerald Schaefer <gerald.schaefer@de.ibm.com> [s390]
Acked-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Hans-Christian Noren Egtvedt <egtvedt@samfundet.no> [avr32]
Acked-by: Vineet Gupta <vgupta@synopsys.com> [arc]
Acked-by: Robin Murphy <robin.murphy@arm.com> [arm64 and dma-iommu]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/dma-noop.c |  9 +++++----
 lib/swiotlb.c  | 13 +++++++------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'lib')

diff --git a/lib/dma-noop.c b/lib/dma-noop.c
index 72145646857e..3d766e78fbe2 100644
--- a/lib/dma-noop.c
+++ b/lib/dma-noop.c
@@ -10,7 +10,7 @@
 
 static void *dma_noop_alloc(struct device *dev, size_t size,
 			    dma_addr_t *dma_handle, gfp_t gfp,
-			    struct dma_attrs *attrs)
+			    unsigned long attrs)
 {
 	void *ret;
 
@@ -22,7 +22,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size,
 
 static void dma_noop_free(struct device *dev, size_t size,
 			  void *cpu_addr, dma_addr_t dma_addr,
-			  struct dma_attrs *attrs)
+			  unsigned long attrs)
 {
 	free_pages((unsigned long)cpu_addr, get_order(size));
 }
@@ -30,13 +30,14 @@ static void dma_noop_free(struct device *dev, size_t size,
 static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page,
 				      unsigned long offset, size_t size,
 				      enum dma_data_direction dir,
-				      struct dma_attrs *attrs)
+				      unsigned long attrs)
 {
 	return page_to_phys(page) + offset;
 }
 
 static int dma_noop_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
-			     enum dma_data_direction dir, struct dma_attrs *attrs)
+			     enum dma_data_direction dir,
+			     unsigned long attrs)
 {
 	int i;
 	struct scatterlist *sg;
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 76f29ecba8f4..22e13a0e19d7 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -738,7 +738,7 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
 dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 			    unsigned long offset, size_t size,
 			    enum dma_data_direction dir,
-			    struct dma_attrs *attrs)
+			    unsigned long attrs)
 {
 	phys_addr_t map, phys = page_to_phys(page) + offset;
 	dma_addr_t dev_addr = phys_to_dma(dev, phys);
@@ -807,7 +807,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 
 void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 			size_t size, enum dma_data_direction dir,
-			struct dma_attrs *attrs)
+			unsigned long attrs)
 {
 	unmap_single(hwdev, dev_addr, size, dir);
 }
@@ -877,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_device);
  */
 int
 swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
-		     enum dma_data_direction dir, struct dma_attrs *attrs)
+		     enum dma_data_direction dir, unsigned long attrs)
 {
 	struct scatterlist *sg;
 	int i;
@@ -914,7 +914,7 @@ int
 swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	       enum dma_data_direction dir)
 {
-	return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+	return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0);
 }
 EXPORT_SYMBOL(swiotlb_map_sg);
 
@@ -924,7 +924,8 @@ EXPORT_SYMBOL(swiotlb_map_sg);
  */
 void
 swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
-		       int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
+		       int nelems, enum dma_data_direction dir,
+		       unsigned long attrs)
 {
 	struct scatterlist *sg;
 	int i;
@@ -941,7 +942,7 @@ void
 swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 		 enum dma_data_direction dir)
 {
-	return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+	return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0);
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg);
 
-- 
cgit v1.2.3


From 9049fc745300c5e2236cbfc69f5e8cadb6f1f57c Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 3 Aug 2016 13:46:39 -0700
Subject: dynamic_debug: add jump label support

Although dynamic debug is often only used for debug builds, sometimes
its enabled for production builds as well.  Minimize its impact by using
jump labels.  This reduces the text section by 7000+ bytes in the kernel
image below.  It does increase data, but this should only be referenced
when changing the direction of the branches, and hence usually not in
cache.

     text     data     bss       dec     hex  filename
  8194852  4879776  925696  14000324  d5a0c4  vmlinux.pre
  8187337  4960224  925696  14073257  d6bda9  vmlinux.post

Link: http://lkml.kernel.org/r/d165b465e8c89bc582d973758d40be44c33f018b.1467837322.git.jbaron@akamai.com
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Joe Perches <joe@perches.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dynamic_debug.h | 60 ++++++++++++++++++++++++++++++++++++++-----
 lib/dynamic_debug.c           |  7 +++++
 2 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'lib')

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 4f1bbc68cd1b..546d68057e3b 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -1,6 +1,10 @@
 #ifndef _DYNAMIC_DEBUG_H
 #define _DYNAMIC_DEBUG_H
 
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+#include <linux/jump_label.h>
+#endif
+
 /*
  * An instance of this structure is created in a special
  * ELF section at every dynamic debug callsite.  At runtime,
@@ -33,6 +37,12 @@ struct _ddebug {
 #define _DPRINTK_FLAGS_DEFAULT 0
 #endif
 	unsigned int flags:8;
+#ifdef HAVE_JUMP_LABEL
+	union {
+		struct static_key_true dd_key_true;
+		struct static_key_false dd_key_false;
+	} key;
+#endif
 } __attribute__((aligned(8)));
 
 
@@ -60,7 +70,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
 			  const struct net_device *dev,
 			  const char *fmt, ...);
 
-#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)		\
+#define DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, key, init)	\
 	static struct _ddebug  __aligned(8)			\
 	__attribute__((section("__verbose"))) name = {		\
 		.modname = KBUILD_MODNAME,			\
@@ -68,13 +78,51 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
 		.filename = __FILE__,				\
 		.format = (fmt),				\
 		.lineno = __LINE__,				\
-		.flags =  _DPRINTK_FLAGS_DEFAULT,		\
+		.flags = _DPRINTK_FLAGS_DEFAULT,		\
+		dd_key_init(key, init)				\
 	}
 
+#ifdef HAVE_JUMP_LABEL
+
+#define dd_key_init(key, init) key = (init)
+
+#ifdef DEBUG
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
+	DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, .key.dd_key_true, \
+					  (STATIC_KEY_TRUE_INIT))
+
+#define DYNAMIC_DEBUG_BRANCH(descriptor) \
+	static_branch_likely(&descriptor.key.dd_key_true)
+#else
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
+	DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, .key.dd_key_false, \
+					  (STATIC_KEY_FALSE_INIT))
+
+#define DYNAMIC_DEBUG_BRANCH(descriptor) \
+	static_branch_unlikely(&descriptor.key.dd_key_false)
+#endif
+
+#else
+
+#define dd_key_init(key, init)
+
+#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \
+	DEFINE_DYNAMIC_DEBUG_METADATA_KEY(name, fmt, 0, 0)
+
+#ifdef DEBUG
+#define DYNAMIC_DEBUG_BRANCH(descriptor) \
+	likely(descriptor.flags & _DPRINTK_FLAGS_PRINT)
+#else
+#define DYNAMIC_DEBUG_BRANCH(descriptor) \
+	unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)
+#endif
+
+#endif
+
 #define dynamic_pr_debug(fmt, ...)				\
 do {								\
 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);		\
-	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))	\
+	if (DYNAMIC_DEBUG_BRANCH(descriptor))			\
 		__dynamic_pr_debug(&descriptor, pr_fmt(fmt),	\
 				   ##__VA_ARGS__);		\
 } while (0)
@@ -82,7 +130,7 @@ do {								\
 #define dynamic_dev_dbg(dev, fmt, ...)				\
 do {								\
 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);		\
-	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))	\
+	if (DYNAMIC_DEBUG_BRANCH(descriptor))			\
 		__dynamic_dev_dbg(&descriptor, dev, fmt,	\
 				  ##__VA_ARGS__);		\
 } while (0)
@@ -90,7 +138,7 @@ do {								\
 #define dynamic_netdev_dbg(dev, fmt, ...)			\
 do {								\
 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);		\
-	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))	\
+	if (DYNAMIC_DEBUG_BRANCH(descriptor))			\
 		__dynamic_netdev_dbg(&descriptor, dev, fmt,	\
 				     ##__VA_ARGS__);		\
 } while (0)
@@ -100,7 +148,7 @@ do {								\
 do {								\
 	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor,		\
 		__builtin_constant_p(prefix_str) ? prefix_str : "hexdump");\
-	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT))	\
+	if (DYNAMIC_DEBUG_BRANCH(descriptor))			\
 		print_hex_dump(KERN_DEBUG, prefix_str,		\
 			       prefix_type, rowsize, groupsize,	\
 			       buf, len, ascii);		\
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index fe42b6ec3f0c..da796e2dc4f5 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -188,6 +188,13 @@ static int ddebug_change(const struct ddebug_query *query,
 			newflags = (dp->flags & mask) | flags;
 			if (newflags == dp->flags)
 				continue;
+#ifdef HAVE_JUMP_LABEL
+			if (dp->flags & _DPRINTK_FLAGS_PRINT) {
+				if (!(flags & _DPRINTK_FLAGS_PRINT))
+					static_branch_disable(&dp->key.dd_key_true);
+			} else if (flags & _DPRINTK_FLAGS_PRINT)
+				static_branch_enable(&dp->key.dd_key_true);
+#endif
 			dp->flags = newflags;
 			vpr_info("changed %s:%d [%s]%s =%s\n",
 				 trim_prefix(dp->filename), dp->lineno,
-- 
cgit v1.2.3


From 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Aug 2016 13:02:01 -0700
Subject: unsafe_[get|put]_user: change interface to use a error target label

When I initially added the unsafe_[get|put]_user() helpers in commit
5b24a7a2aa20 ("Add 'unsafe' user access functions for batched
accesses"), I made the mistake of modeling the interface on our
traditional __[get|put]_user() functions, which return zero on success,
or -EFAULT on failure.

That interface is fairly easy to use, but it's actually fairly nasty for
good code generation, since it essentially forces the caller to check
the error value for each access.

In particular, since the error handling is already internally
implemented with an exception handler, and we already use "asm goto" for
various other things, we could fairly easily make the error cases just
jump directly to an error label instead, and avoid the need for explicit
checking after each operation.

So switch the interface to pass in an error label, rather than checking
the error value in the caller.  Best do it now before we start growing
more users (the signal handling code in particular would be a good place
to use the new interface).

So rather than

	if (unsafe_get_user(x, ptr))
		... handle error ..

the interface is now

	unsafe_get_user(x, ptr, label);

where an error during the user mode fetch will now just cause a jump to
'label' in the caller.

Right now the actual _implementation_ of this all still ends up being a
"if (err) goto label", and does not take advantage of any exception
label tricks, but for "unsafe_put_user()" in particular it should be
fairly straightforward to convert to using the exception table model.

Note that "unsafe_get_user()" is much harder to convert to a clever
exception table model, because current versions of gcc do not allow the
use of "asm goto" (for the exception) with output values (for the actual
value to be fetched).  But that is hopefully not a limitation in the
long term.

[ Also note that it might be a good idea to switch unsafe_get_user() to
  actually _return_ the value it fetches from user space, but this
  commit only changes the error handling semantics ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess.h | 16 ++++++++--------
 include/linux/uaccess.h        |  4 ++--
 lib/strncpy_from_user.c        |  8 ++++----
 lib/strnlen_user.c             |  7 +++----
 4 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'lib')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c03bfb68c503..52f230094c51 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -812,21 +812,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 #define user_access_begin()	__uaccess_begin()
 #define user_access_end()	__uaccess_end()
 
-#define unsafe_put_user(x, ptr)						\
-({										\
+#define unsafe_put_user(x, ptr, err_label)					\
+do {										\
 	int __pu_err;								\
 	__put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT);		\
-	__builtin_expect(__pu_err, 0);						\
-})
+	if (unlikely(__pu_err)) goto err_label;					\
+} while (0)
 
-#define unsafe_get_user(x, ptr)						\
-({										\
+#define unsafe_get_user(x, ptr, err_label)					\
+do {										\
 	int __gu_err;								\
 	unsigned long __gu_val;							\
 	__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;				\
-	__builtin_expect(__gu_err, 0);						\
-})
+	if (unlikely(__gu_err)) goto err_label;					\
+} while (0)
 
 #endif /* _ASM_X86_UACCESS_H */
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 349557825428..f30c187ed785 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 #ifndef user_access_begin
 #define user_access_begin() do { } while (0)
 #define user_access_end() do { } while (0)
-#define unsafe_get_user(x, ptr) __get_user(x, ptr)
-#define unsafe_put_user(x, ptr) __put_user(x, ptr)
+#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
+#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
 #endif
 
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 33f655ef48cd..9c5fe8110413 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -40,8 +40,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
 		unsigned long c, data;
 
 		/* Fall back to byte-at-a-time if we get a page fault */
-		if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res))))
-			break;
+		unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);
+
 		*(unsigned long *)(dst+res) = c;
 		if (has_zero(c, &data, &constants)) {
 			data = prep_zero_mask(c, data, &constants);
@@ -56,8 +56,7 @@ byte_at_a_time:
 	while (max) {
 		char c;
 
-		if (unlikely(unsafe_get_user(c,src+res)))
-			return -EFAULT;
+		unsafe_get_user(c,src+res, efault);
 		dst[res] = c;
 		if (!c)
 			return res;
@@ -76,6 +75,7 @@ byte_at_a_time:
 	 * Nope: we hit the address space limit, and we still had more
 	 * characters the caller would have wanted. That's an EFAULT.
 	 */
+efault:
 	return -EFAULT;
 }
 
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 2625943625d7..8e105ed4df12 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 	src -= align;
 	max += align;
 
-	if (unlikely(unsafe_get_user(c,(unsigned long __user *)src)))
-		return 0;
+	unsafe_get_user(c, (unsigned long __user *)src, efault);
 	c |= aligned_byte_mask(align);
 
 	for (;;) {
@@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 		if (unlikely(max <= sizeof(unsigned long)))
 			break;
 		max -= sizeof(unsigned long);
-		if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res))))
-			return 0;
+		unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
 	}
 	res -= align;
 
@@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 	 * Nope: we hit the address space limit, and we still had more
 	 * characters the caller would have wanted. That's 0.
 	 */
+efault:
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4cf0b354d92ee2c642532ee39e330f8f580fd985 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 12 Aug 2016 12:03:52 +0200
Subject: rhashtable: avoid large lock-array allocations

Sander reports following splat after netfilter nat bysrc table got
converted to rhashtable:

swapper/0: page allocation failure: order:3, mode:0x2084020(GFP_ATOMIC|__GFP_COMP)
 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.8.0-rc1 [..]
 [<ffffffff811633ed>] warn_alloc_failed+0xdd/0x140
 [<ffffffff811638b1>] __alloc_pages_nodemask+0x3e1/0xcf0
 [<ffffffff811a72ed>] alloc_pages_current+0x8d/0x110
 [<ffffffff8117cb7f>] kmalloc_order+0x1f/0x70
 [<ffffffff811aec19>] __kmalloc+0x129/0x140
 [<ffffffff8146d561>] bucket_table_alloc+0xc1/0x1d0
 [<ffffffff8146da1d>] rhashtable_insert_rehash+0x5d/0xe0
 [<ffffffff819fcfff>] nf_nat_setup_info+0x2ef/0x400

The failure happens when allocating the spinlock array.
Even with GFP_KERNEL its unlikely for such a large allocation
to succeed.

Thomas Graf pointed me at inet_ehash_locks_alloc(), so in addition
to adding NOWARN for atomic allocations this also makes the bucket-array
sizing more conservative.

In commit 095dc8e0c3686 ("tcp: fix/cleanup inet_ehash_locks_alloc()"),
Eric Dumazet says: "Budget 2 cache lines per cpu worth of 'spinlocks'".
IOW, consider size needed by a single spinlock when determining
number of locks per cpu.  So with 64 byte per cacheline and 4 byte per
spinlock this gives 32 locks per cpu.

Resulting size of the lock-array (sizeof(spinlock) == 4):

cpus:    1   2   4   8   16   32   64
old:    1k  1k  4k  8k  16k  16k  16k
new:   128 256 512  1k   2k   4k   8k

8k allocation should have decent chance of success even
with GFP_ATOMIC, and should not fail with GFP_KERNEL.

With 72-byte spinlock (LOCKDEP):
cpus :   1   2
old:    9k 18k
new:   ~2k ~4k

Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Suggested-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 5d845ffd7982..42acd81f10db 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -30,7 +30,7 @@
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4U
-#define BUCKET_LOCKS_PER_CPU   128UL
+#define BUCKET_LOCKS_PER_CPU	32UL
 
 static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
@@ -70,7 +70,7 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	unsigned int nr_pcpus = num_possible_cpus();
 #endif
 
-	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
+	nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
 	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 
 	/* Never allocate more than 0.5 locks per bucket */
@@ -83,6 +83,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 			tbl->locks = vmalloc(size * sizeof(spinlock_t));
 		else
 #endif
+		if (gfp != GFP_KERNEL)
+			gfp |= __GFP_NOWARN | __GFP_NORETRY;
+
 		tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
 					   gfp);
 		if (!tbl->locks)
-- 
cgit v1.2.3


From 12311959ecf8a3a64676c01b62ce67a0c5f0fd49 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Fri, 12 Aug 2016 20:10:44 +0200
Subject: rhashtable: fix shift by 64 when shrinking

I got this:

    ================================================================================
    UBSAN: Undefined behaviour in ./include/linux/log2.h:63:13
    shift exponent 64 is too large for 64-bit type 'long unsigned int'
    CPU: 1 PID: 721 Comm: kworker/1:1 Not tainted 4.8.0-rc1+ #87
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014
    Workqueue: events rht_deferred_worker
     0000000000000000 ffff88011661f8d8 ffffffff82344f50 0000000041b58ab3
     ffffffff84f98000 ffffffff82344ea4 ffff88011661f900 ffff88011661f8b0
     0000000000000001 ffff88011661f6b8 dffffc0000000000 ffffffff867f7640
    Call Trace:
     [<ffffffff82344f50>] dump_stack+0xac/0xfc
     [<ffffffff82344ea4>] ? _atomic_dec_and_lock+0xc4/0xc4
     [<ffffffff8242f5b8>] ubsan_epilogue+0xd/0x8a
     [<ffffffff82430c41>] __ubsan_handle_shift_out_of_bounds+0x255/0x29a
     [<ffffffff824309ec>] ? __ubsan_handle_out_of_bounds+0x180/0x180
     [<ffffffff84003436>] ? nl80211_req_set_reg+0x256/0x2f0
     [<ffffffff812112ba>] ? print_context_stack+0x8a/0x160
     [<ffffffff81200031>] ? amd_pmu_reset+0x341/0x380
     [<ffffffff823af808>] rht_deferred_worker+0x1618/0x1790
     [<ffffffff823af808>] ? rht_deferred_worker+0x1618/0x1790
     [<ffffffff823ae1f0>] ? rhashtable_jhash2+0x370/0x370
     [<ffffffff8134c12d>] ? process_one_work+0x6fd/0x1970
     [<ffffffff8134c1cf>] process_one_work+0x79f/0x1970
     [<ffffffff8134c12d>] ? process_one_work+0x6fd/0x1970
     [<ffffffff8134ba30>] ? try_to_grab_pending+0x4c0/0x4c0
     [<ffffffff8134d564>] ? worker_thread+0x1c4/0x1340
     [<ffffffff8134d8ff>] worker_thread+0x55f/0x1340
     [<ffffffff845e904f>] ? __schedule+0x4df/0x1d40
     [<ffffffff8134d3a0>] ? process_one_work+0x1970/0x1970
     [<ffffffff8134d3a0>] ? process_one_work+0x1970/0x1970
     [<ffffffff813642f7>] kthread+0x237/0x390
     [<ffffffff813640c0>] ? __kthread_parkme+0x280/0x280
     [<ffffffff845f8c93>] ? _raw_spin_unlock_irq+0x33/0x50
     [<ffffffff845f95df>] ret_from_fork+0x1f/0x40
     [<ffffffff813640c0>] ? __kthread_parkme+0x280/0x280
    ================================================================================

roundup_pow_of_two() is undefined when called with an argument of 0, so
let's avoid the call and just fall back to ht->p.min_size (which should
never be smaller than HASH_MIN_SIZE).

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 42acd81f10db..5ba520b544d7 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -324,12 +324,14 @@ static int rhashtable_expand(struct rhashtable *ht)
 static int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
-	unsigned int size;
+	unsigned int nelems = atomic_read(&ht->nelems);
+	unsigned int size = 0;
 	int err;
 
 	ASSERT_RHT_MUTEX(ht);
 
-	size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2);
+	if (nelems)
+		size = roundup_pow_of_two(nelems * 3 / 2);
 	if (size < ht->p.min_size)
 		size = ht->p.min_size;
 
-- 
cgit v1.2.3