48 files changed, 592 insertions, 1361 deletions
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index f6fcb8a79889..a810fa8ba404 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -45,35 +45,42 @@ config DEBUG_WX
 
 		If in doubt, say "Y".
 
-# RMK wants arm kernels compiled with frame pointers or stack unwinding.
-# If you know what you are doing and are willing to live without stack
-# traces, you can get a slightly smaller kernel by setting this option to
-# n, but then RMK will have to kill you ;).
-config FRAME_POINTER
-	bool
-	depends on !THUMB2_KERNEL
-	default y if !ARM_UNWIND || FUNCTION_GRAPH_TRACER
+choice
+	prompt "Choose kernel unwinder"
+	default UNWINDER_ARM if AEABI && !FUNCTION_GRAPH_TRACER
+	default UNWINDER_FRAME_POINTER if !AEABI || FUNCTION_GRAPH_TRACER
 	help
-	  If you say N here, the resulting kernel will be slightly smaller and
-	  faster. However, if neither FRAME_POINTER nor ARM_UNWIND are enabled,
-	  when a problem occurs with the kernel, the information that is
-	  reported is severely limited.
+	  This determines which method will be used for unwinding kernel stack
+	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+	  livepatch, lockdep, and more.
+
+config UNWINDER_FRAME_POINTER
+	bool "Frame pointer unwinder"
+	depends on !THUMB2_KERNEL && !CC_IS_CLANG
+	select ARCH_WANT_FRAME_POINTERS
+	select FRAME_POINTER
+	help
+	  This option enables the frame pointer unwinder for unwinding
+	  kernel stack traces.
 
-config ARM_UNWIND
-	bool "Enable stack unwinding support (EXPERIMENTAL)"
+config UNWINDER_ARM
+	bool "ARM EABI stack unwinder"
 	depends on AEABI
-	default y
+	select ARM_UNWIND
 	help
 	  This option enables stack unwinding support in the kernel
 	  using the information automatically generated by the
 	  compiler. The resulting kernel image is slightly bigger but
 	  the performance is not affected. Currently, this feature
-	  only works with EABI compilers. If unsure say Y.
+	  only works with EABI compilers.
 
-config OLD_MCOUNT
+endchoice
+
+config ARM_UNWIND
+	bool
+
+config FRAME_POINTER
 	bool
-	depends on FUNCTION_TRACER && FRAME_POINTER
-	default y
 
 config DEBUG_USER
 	bool "Verbose user fault messages"
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index d1516f85f25d..05a91d8b89f3 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -74,7 +74,7 @@ endif
 arch-$(CONFIG_CPU_32v5)		=-D__LINUX_ARM_ARCH__=5 $(call cc-option,-march=armv5te,-march=armv4t)
 arch-$(CONFIG_CPU_32v4T)	=-D__LINUX_ARM_ARCH__=4 -march=armv4t
 arch-$(CONFIG_CPU_32v4)		=-D__LINUX_ARM_ARCH__=4 -march=armv4
-arch-$(CONFIG_CPU_32v3)		=-D__LINUX_ARM_ARCH__=3 -march=armv3
+arch-$(CONFIG_CPU_32v3)		=-D__LINUX_ARM_ARCH__=3 -march=armv3m
 
 # Evaluate arch cc-option calls now
 arch-y := $(arch-y)
@@ -264,13 +264,9 @@ platdirs := $(patsubst %,arch/arm/plat-%/,$(sort $(plat-y)))
 
 ifneq ($(CONFIG_ARCH_MULTIPLATFORM),y)
 ifneq ($(CONFIG_ARM_SINGLE_ARMV7M),y)
-ifeq ($(KBUILD_SRC),)
-KBUILD_CPPFLAGS += $(patsubst %,-I%include,$(machdirs) $(platdirs))
-else
 KBUILD_CPPFLAGS += $(patsubst %,-I$(srctree)/%include,$(machdirs) $(platdirs))
 endif
 endif
-endif
 
 export	TEXT_OFFSET GZFLAGS MMUEXT
 
@@ -307,12 +303,7 @@ else
 KBUILD_IMAGE := $(boot)/zImage
 endif
 
-# Build the DT binary blobs if we have OF configured
-ifeq ($(CONFIG_USE_OF),y)
-KBUILD_DTBS := dtbs
-endif
-
-all:	$(notdir $(KBUILD_IMAGE)) $(KBUILD_DTBS)
+all:	$(notdir $(KBUILD_IMAGE))
 
 
 archheaders:
@@ -339,17 +330,6 @@ $(BOOT_TARGETS): vmlinux
 $(INSTALL_TARGETS):
 	$(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@
 
-%.dtb: | scripts
-	$(Q)$(MAKE) $(build)=$(boot)/dts MACHINE=$(MACHINE) $(boot)/dts/$@
-
-PHONY += dtbs dtbs_install
-
-dtbs: prepare scripts
-	$(Q)$(MAKE) $(build)=$(boot)/dts
-
-dtbs_install:
-	$(Q)$(MAKE) $(dtbinst)=$(boot)/dts
-
 PHONY += vdso_install
 vdso_install:
 ifeq ($(CONFIG_VDSO),y)
@@ -371,8 +351,6 @@ define archhelp
   echo  '  uImage        - U-Boot wrapped zImage'
   echo  '  bootpImage    - Combined zImage and initial RAM disk'
   echo  '                  (supply initrd image via make variable INITRD=<path>)'
-  echo  '* dtbs          - Build device tree blobs for enabled boards'
-  echo  '  dtbs_install  - Install dtbs to $(INSTALL_DTBS_PATH)'
   echo  '  install       - Install uncompressed kernel'
   echo  '  zinstall      - Install compressed kernel'
   echo  '  uinstall      - Install U-Boot wrapped compressed kernel'
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 517e0e18f0b8..6c7ccb428c07 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -114,6 +114,35 @@
 #endif
 		.endm
 
+		/*
+		 * Debug kernel copy by printing the memory addresses involved
+		 */
+		.macro dbgkc, begin, end, cbegin, cend
+#ifdef DEBUG
+		kputc   #'\n'
+		kputc   #'C'
+		kputc   #':'
+		kputc   #'0'
+		kputc   #'x'
+		kphex   \begin, 8	/* Start of compressed kernel */
+		kputc	#'-'
+		kputc	#'0'
+		kputc	#'x'
+		kphex	\end, 8		/* End of compressed kernel */
+		kputc	#'-'
+		kputc	#'>'
+		kputc   #'0'
+		kputc   #'x'
+		kphex   \cbegin, 8	/* Start of kernel copy */
+		kputc	#'-'
+		kputc	#'0'
+		kputc	#'x'
+		kphex	\cend, 8	/* End of kernel copy */
+		kputc	#'\n'
+		kputc	#'\r'
+#endif
+		.endm
+
 		.section ".start", #alloc, #execinstr
 /*
  * sort out different calling conventions
@@ -450,6 +479,20 @@ dtb_check_done:
 		add	r6, r9, r5
 		add	r9, r9, r10
 
+#ifdef DEBUG
+		sub     r10, r6, r5
+		sub     r10, r9, r10
+		/*
+		 * We are about to copy the kernel to a new memory area.
+		 * The boundaries of the new memory area can be found in
+		 * r10 and r9, whilst r5 and r6 contain the boundaries
+		 * of the memory we are going to copy.
+		 * Calling dbgkc will help with the printing of this
+		 * information.
+		 */
+		dbgkc	r5, r6, r10, r9
+#endif
+
 1:		ldmdb	r6!, {r0 - r3, r10 - r12, lr}
 		cmp	r6, r5
 		stmdb	r9!, {r0 - r3, r10 - r12, lr}
diff --git a/arch/arm/boot/compressed/libfdt_env.h b/arch/arm/boot/compressed/libfdt_env.h
index 07437816e098..b36c0289a308 100644
--- a/arch/arm/boot/compressed/libfdt_env.h
+++ b/arch/arm/boot/compressed/libfdt_env.h
@@ -6,6 +6,8 @@
 #include <linux/string.h>
 #include <asm/byteorder.h>
 
+#define INT_MAX			((int)(~0U>>1))
+
 typedef __be16 fdt16_t;
 typedef __be32 fdt32_t;
 typedef __be64 fdt64_t;
diff --git a/arch/arm/boot/dts/imx7d.dtsi b/arch/arm/boot/dts/imx7d.dtsi
index 7234e8330a57..efbdeaaa8dcd 100644
--- a/arch/arm/boot/dts/imx7d.dtsi
+++ b/arch/arm/boot/dts/imx7d.dtsi
@@ -146,8 +146,9 @@
 		fsl,max-link-speed = <2>;
 		power-domains = <&pgc_pcie_phy>;
 		resets = <&src IMX7_RESET_PCIEPHY>,
-			 <&src IMX7_RESET_PCIE_CTRL_APPS_EN>;
-		reset-names = "pciephy", "apps";
+			 <&src IMX7_RESET_PCIE_CTRL_APPS_EN>,
+			 <&src IMX7_RESET_PCIE_CTRL_APPS_TURNOFF>;
+		reset-names = "pciephy", "apps", "turnoff";
 		status = "disabled";
 	};
 };
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 925d1364727a..ef0c7feea6e2 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
 	select CRYPTO_CRYPTD
+	select CRYPTO_GF128MUL
 	help
 	  Use an implementation of GHASH (used by the GCM AEAD chaining mode)
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@@ -121,10 +122,4 @@ config CRYPTO_CHACHA20_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
-config CRYPTO_SPECK_NEON
-	tristate "NEON accelerated Speck cipher algorithms"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_BLKCIPHER
-	select CRYPTO_SPECK
-
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8de542c48ade..bd5bceef0605 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -54,7 +53,6 @@ ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
-speck-neon-y := speck-neon-core.o speck-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 451a849ad518..50e7b9896818 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -18,6 +18,34 @@
  * (at your option) any later version.
  */
 
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
+  * (c)  vrev32.16			(16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
+  *					 needs index vector)
+  *
+  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
+  * rotations, the only choices are (a) and (b).  We use (a) since it takes
+  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
 #include <linux/linkage.h>
 
 	.text
@@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon)
 	vmov		q10, q2
 	vmov		q11, q3
 
+	adr		ip, .Lrol8_table
 	mov		r3, #10
+	vld1.8		{d10}, [ip, :64]
 
 .Ldoubleround:
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon)
 
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #8
-	vsri.u32	q3, q4, #24
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
 
 	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 	vadd.i32	q2, q2, q3
@@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon)
 
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #8
-	vsri.u32	q3, q4, #24
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
 
 	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 	vadd.i32	q2, q2, q3
@@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon)
 	bx		lr
 ENDPROC(chacha20_block_xor_neon)
 
+	.align		4
+.Lctrinc:	.word	0, 1, 2, 3
+.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
+
 	.align		5
 ENTRY(chacha20_4block_xor_neon)
-	push		{r4-r6, lr}
-	mov		ip, sp			// preserve the stack pointer
-	sub		r3, sp, #0x20		// allocate a 32 byte buffer
-	bic		r3, r3, #0x1f		// aligned to 32 bytes
-	mov		sp, r3
+	push		{r4-r5}
+	mov		r4, sp			// preserve the stack pointer
+	sub		ip, sp, #0x20		// allocate a 32 byte buffer
+	bic		ip, ip, #0x1f		// aligned to 32 bytes
+	mov		sp, ip
 
 	// r0: Input state matrix, s
 	// r1: 4 data blocks output, o
@@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon)
 	// This function encrypts four consecutive ChaCha20 blocks by loading
 	// the state matrix in NEON registers four times. The algorithm performs
 	// each operation on the corresponding word of each state matrix, hence
-	// requires no word shuffling. For final XORing step we transpose the
-	// matrix by interleaving 32- and then 64-bit words, which allows us to
-	// do XOR in NEON registers.
+	// requires no word shuffling. The words are re-interleaved before the
+	// final addition of the original state and the XORing step.
 	//
 
-	// x0..15[0-3] = s0..3[0..3]
-	add		r3, r0, #0x20
+	// x0..15[0-3] = s0..15[0-3]
+	add		ip, r0, #0x20
 	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [r3]
+	vld1.32		{q2-q3}, [ip]
 
-	adr		r3, CTRINC
+	adr		r5, .Lctrinc
 	vdup.32		q15, d7[1]
 	vdup.32		q14, d7[0]
-	vld1.32		{q11}, [r3, :128]
+	vld1.32		{q4}, [r5, :128]
 	vdup.32		q13, d6[1]
 	vdup.32		q12, d6[0]
-	vadd.i32	q12, q12, q11		// x12 += counter values 0-3
 	vdup.32		q11, d5[1]
 	vdup.32		q10, d5[0]
+	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
 	vdup.32		q9, d4[1]
 	vdup.32		q8, d4[0]
 	vdup.32		q7, d3[1]
@@ -185,9 +218,13 @@ ENTRY(chacha20_4block_xor_neon)
 	vdup.32		q1, d0[1]
 	vdup.32		q0, d0[0]
 
+	adr		ip, .Lrol8_table
 	mov		r3, #10
+	b		1f
 
 .Ldoubleround4:
+	vld1.32		{q8-q9}, [sp, :256]
+1:
 	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
 	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
@@ -236,24 +273,25 @@ ENTRY(chacha20_4block_xor_neon)
 	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
 	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
 	vadd.i32	q0, q0, q4
 	vadd.i32	q1, q1, q5
 	vadd.i32	q2, q2, q6
 	vadd.i32	q3, q3, q7
 
-	veor		q8, q12, q0
-	veor		q9, q13, q1
-	vshl.u32	q12, q8, #8
-	vshl.u32	q13, q9, #8
-	vsri.u32	q12, q8, #24
-	vsri.u32	q13, q9, #24
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
 
-	veor		q8, q14, q2
-	veor		q9, q15, q3
-	vshl.u32	q14, q8, #8
-	vshl.u32	q15, q9, #8
-	vsri.u32	q14, q8, #24
-	vsri.u32	q15, q9, #24
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
 
 	vld1.32		{q8-q9}, [sp, :256]
 
@@ -332,24 +370,25 @@ ENTRY(chacha20_4block_xor_neon)
 	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
 	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
 	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
 	vadd.i32	q0, q0, q5
 	vadd.i32	q1, q1, q6
 	vadd.i32	q2, q2, q7
 	vadd.i32	q3, q3, q4
 
-	veor		q8, q15, q0
-	veor		q9, q12, q1
-	vshl.u32	q15, q8, #8
-	vshl.u32	q12, q9, #8
-	vsri.u32	q15, q8, #24
-	vsri.u32	q12, q9, #24
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
 
-	veor		q8, q13, q2
-	veor		q9, q14, q3
-	vshl.u32	q13, q8, #8
-	vshl.u32	q14, q9, #8
-	vsri.u32	q13, q8, #24
-	vsri.u32	q14, q9, #24
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
 
 	vld1.32		{q8-q9}, [sp, :256]
 
@@ -379,104 +418,76 @@ ENTRY(chacha20_4block_xor_neon)
 	vsri.u32	q6, q9, #25
 
 	subs		r3, r3, #1
-	beq		0f
-
-	vld1.32		{q8-q9}, [sp, :256]
-	b		.Ldoubleround4
-
-	// x0[0-3] += s0[0]
-	// x1[0-3] += s0[1]
-	// x2[0-3] += s0[2]
-	// x3[0-3] += s0[3]
-0:	ldmia		r0!, {r3-r6}
-	vdup.32		q8, r3
-	vdup.32		q9, r4
-	vadd.i32	q0, q0, q8
-	vadd.i32	q1, q1, q9
-	vdup.32		q8, r5
-	vdup.32		q9, r6
-	vadd.i32	q2, q2, q8
-	vadd.i32	q3, q3, q9
-
-	// x4[0-3] += s1[0]
-	// x5[0-3] += s1[1]
-	// x6[0-3] += s1[2]
-	// x7[0-3] += s1[3]
-	ldmia		r0!, {r3-r6}
-	vdup.32		q8, r3
-	vdup.32		q9, r4
-	vadd.i32	q4, q4, q8
-	vadd.i32	q5, q5, q9
-	vdup.32		q8, r5
-	vdup.32		q9, r6
-	vadd.i32	q6, q6, q8
-	vadd.i32	q7, q7, q9
-
-	// interleave 32-bit words in state n, n+1
-	vzip.32		q0, q1
-	vzip.32		q2, q3
-	vzip.32		q4, q5
-	vzip.32		q6, q7
-
-	// interleave 64-bit words in state n, n+2
+	bne		.Ldoubleround4
+
+	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+	// x8..9[0-3] are on the stack.
+
+	// Re-interleave the words in the first two rows of each block (x0..7).
+	// Also add the counter values 0-3 to x12[0-3].
+	  vld1.32	{q8}, [r5, :128]	// load counter values 0-3
+	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
+	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
+	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
+	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
+	  vadd.u32	q12, q8			// x12 += counter values 0-3
 	vswp		d1, d4
 	vswp		d3, d6
+	  vld1.32	{q8-q9}, [r0]!		// load s0..7
 	vswp		d9, d12
 	vswp		d11, d14
 
-	// xor with corresponding input, write to output
+	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+	// after XORing the first 32 bytes.
+	vswp		q1, q4
+
+	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
+	vadd.u32	q0, q0, q8
+	vadd.u32	q2, q2, q8
+	vadd.u32	q4, q4, q8
+	vadd.u32	q3, q3, q8
+
+	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
+	vadd.u32	q1, q1, q9
+	vadd.u32	q6, q6, q9
+	vadd.u32	q5, q5, q9
+	vadd.u32	q7, q7, q9
+
+	// XOR first 32 bytes using keystream from first two rows of first block
 	vld1.8		{q8-q9}, [r2]!
 	veor		q8, q8, q0
-	veor		q9, q9, q4
+	veor		q9, q9, q1
 	vst1.8		{q8-q9}, [r1]!
 
+	// Re-interleave the words in the last two rows of each block (x8..15).
 	vld1.32		{q8-q9}, [sp, :256]
-
-	// x8[0-3] += s2[0]
-	// x9[0-3] += s2[1]
-	// x10[0-3] += s2[2]
-	// x11[0-3] += s2[3]
-	ldmia		r0!, {r3-r6}
-	vdup.32		q0, r3
-	vdup.32		q4, r4
-	vadd.i32	q8, q8, q0
-	vadd.i32	q9, q9, q4
-	vdup.32		q0, r5
-	vdup.32		q4, r6
-	vadd.i32	q10, q10, q0
-	vadd.i32	q11, q11, q4
-
-	// x12[0-3] += s3[0]
-	// x13[0-3] += s3[1]
-	// x14[0-3] += s3[2]
-	// x15[0-3] += s3[3]
-	ldmia		r0!, {r3-r6}
-	vdup.32		q0, r3
-	vdup.32		q4, r4
-	adr		r3, CTRINC
-	vadd.i32	q12, q12, q0
-	vld1.32		{q0}, [r3, :128]
-	vadd.i32	q13, q13, q4
-	vadd.i32	q12, q12, q0		// x12 += counter values 0-3
-
-	vdup.32		q0, r5
-	vdup.32		q4, r6
-	vadd.i32	q14, q14, q0
-	vadd.i32	q15, q15, q4
-
-	// interleave 32-bit words in state n, n+1
-	vzip.32		q8, q9
-	vzip.32		q10, q11
-	vzip.32		q12, q13
-	vzip.32		q14, q15
-
-	// interleave 64-bit words in state n, n+2
-	vswp		d17, d20
-	vswp		d19, d22
+	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
+	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
+	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
+	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
+	  vld1.32	{q0-q1}, [r0]	// load s8..15
 	vswp		d25, d28
 	vswp		d27, d30
+	vswp		d17, d20
+	vswp		d19, d22
+
+	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
+	vadd.u32	q8,  q8,  q0
+	vadd.u32	q10, q10, q0
+	vadd.u32	q9,  q9,  q0
+	vadd.u32	q11, q11, q0
+
+	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+	vadd.u32	q12, q12, q1
+	vadd.u32	q14, q14, q1
+	vadd.u32	q13, q13, q1
+	vadd.u32	q15, q15, q1
 
-	vmov		q4, q1
+	// XOR the rest of the data with the keystream
 
 	vld1.8		{q0-q1}, [r2]!
 	veor		q0, q0, q8
@@ -509,13 +520,11 @@ ENTRY(chacha20_4block_xor_neon)
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]
+	  mov		sp, r4		// restore original stack pointer
 	veor		q0, q0, q11
 	veor		q1, q1, q15
 	vst1.8		{q0-q1}, [r1]
 
-	mov		sp, ip
-	pop		{r4-r6, pc}
+	pop		{r4-r5}
+	bx		lr
 ENDPROC(chacha20_4block_xor_neon)
-
-	.align		4
-CTRINC:	.word		0, 1, 2, 3
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 96e62ec105d0..cd9e93b46c2d 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -236,7 +236,7 @@ static void __exit crc32_pmull_mod_exit(void)
 				  ARRAY_SIZE(crc32_pmull_algs));
 }
 
-static const struct cpu_feature crc32_cpu_feature[] = {
+static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = {
 	{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
 };
 MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 2f78c10b1881..406009afa9cf 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -63,6 +63,33 @@
 	k48		.req	d31
 	SHASH2_p64	.req	d31
 
+	HH		.req	q10
+	HH3		.req	q11
+	HH4		.req	q12
+	HH34		.req	q13
+
+	HH_L		.req	d20
+	HH_H		.req	d21
+	HH3_L		.req	d22
+	HH3_H		.req	d23
+	HH4_L		.req	d24
+	HH4_H		.req	d25
+	HH34_L		.req	d26
+	HH34_H		.req	d27
+	SHASH2_H	.req	d29
+
+	XL2		.req	q5
+	XM2		.req	q6
+	XH2		.req	q7
+	T3		.req	q8
+
+	XL2_L		.req	d10
+	XL2_H		.req	d11
+	XM2_L		.req	d12
+	XM2_H		.req	d13
+	T3_L		.req	d16
+	T3_H		.req	d17
+
 	.text
 	.fpu		crypto-neon-fp-armv8
 
@@ -175,12 +202,77 @@
 	beq		0f
 	vld1.64		{T1}, [ip]
 	teq		r0, #0
-	b		1f
+	b		3f
+
+0:	.ifc		\pn, p64
+	tst		r0, #3			// skip until #blocks is a
+	bne		2f			// round multiple of 4
+
+	vld1.8		{XL2-XM2}, [r2]!
+1:	vld1.8		{T3-T2}, [r2]!
+	vrev64.8	XL2, XL2
+	vrev64.8	XM2, XM2
+
+	subs		r0, r0, #4
+
+	vext.8		T1, XL2, XL2, #8
+	veor		XL2_H, XL2_H, XL_L
+	veor		XL, XL, T1
+
+	vrev64.8	T3, T3
+	vrev64.8	T1, T2
+
+	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
+	veor		XL2_H, XL2_H, XL_H
+	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
+	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
+
+	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
+	veor		XM2_L, XM2_L, XM2_H
+	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
+	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
+
+	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
+	veor		T3_L, T3_L, T3_H
+	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
+	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
+
+	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
+	veor		T1_L, T1_L, T1_H
+	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
+	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
 
-0:	vld1.64		{T1}, [r2]!
+	beq		4f
+
+	vld1.8		{XL2-XM2}, [r2]!
+
+	veor		T1, XL, XH
+	veor		XM, XM, T1
+
+	__pmull_reduce_p64
+
+	veor		T1, T1, XH
+	veor		XL, XL, T1
+
+	b		1b
+	.endif
+
+2:	vld1.64		{T1}, [r2]!
 	subs		r0, r0, #1
 
-1:	/* multiply XL by SHASH in GF(2^128) */
+3:	/* multiply XL by SHASH in GF(2^128) */
 #ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
 #endif
@@ -193,7 +285,7 @@
 	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
 	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
 
-	veor		T1, XL, XH
+4:	veor		T1, XL, XH
 	veor		XM, XM, T1
 
 	__pmull_reduce_\pn
@@ -212,8 +304,14 @@
 	 *			   struct ghash_key const *k, const char *head)
 	 */
 ENTRY(pmull_ghash_update_p64)
-	vld1.64		{SHASH}, [r3]
+	vld1.64		{SHASH}, [r3]!
+	vld1.64		{HH}, [r3]!
+	vld1.64		{HH3-HH4}, [r3]
+
 	veor		SHASH2_p64, SHASH_L, SHASH_H
+	veor		SHASH2_H, HH_L, HH_H
+	veor		HH34_L, HH3_L, HH3_H
+	veor		HH34_H, HH4_L, HH4_H
 
 	vmov.i8		MASK, #0xe1
 	vshl.u64	MASK, MASK, #57
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 8930fc4e7c22..b7d30b6cf49c 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -1,7 +1,7 @@
 /*
  * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
  *
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
 #define GHASH_DIGEST_SIZE	16
 
 struct ghash_key {
-	u64	a;
-	u64	b;
+	u64	h[2];
+	u64	h2[2];
+	u64	h3[2];
+	u64	h4[2];
 };
 
 struct ghash_desc_ctx {
@@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 	return 0;
 }
 
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+	u64 carry = be64_to_cpu(k->a) >> 63;
+
+	h[0] = (be64_to_cpu(k->b) << 1) | carry;
+	h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+	if (carry)
+		h[1] ^= 0xc200000000000000UL;
+}
+
 static int ghash_setkey(struct crypto_shash *tfm,
 			const u8 *inkey, unsigned int keylen)
 {
 	struct ghash_key *key = crypto_shash_ctx(tfm);
-	u64 a, b;
+	be128 h, k;
 
 	if (keylen != GHASH_BLOCK_SIZE) {
 		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 
-	/* perform multiplication by 'x' in GF(2^128) */
-	b = get_unaligned_be64(inkey);
-	a = get_unaligned_be64(inkey + 8);
+	memcpy(&k, inkey, GHASH_BLOCK_SIZE);
+	ghash_reflect(key->h, &k);
+
+	h = k;
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h2, &h);
 
-	key->a = (a << 1) | (b >> 63);
-	key->b = (b << 1) | (a >> 63);
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h3, &h);
 
-	if (b >> 63)
-		key->b ^= 0xc200000000000000UL;
+	gf128mul_lle(&h, &k);
+	ghash_reflect(key->h4, &h);
 
 	return 0;
 }
diff --git a/arch/arm/crypto/speck-neon-core.S b/arch/arm/crypto/speck-neon-core.S
deleted file mode 100644
index 57caa742016e..000000000000
--- a/arch/arm/crypto/speck-neon-core.S
+++ /dev/null
@@ -1,434 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
-	.text
-	.fpu		neon
-
-	// arguments
-	ROUND_KEYS	.req	r0	// const {u64,u32} *round_keys
-	NROUNDS		.req	r1	// int nrounds
-	DST		.req	r2	// void *dst
-	SRC		.req	r3	// const void *src
-	NBYTES		.req	r4	// unsigned int nbytes
-	TWEAK		.req	r5	// void *tweak
-
-	// registers which hold the data being encrypted/decrypted
-	X0		.req	q0
-	X0_L		.req	d0
-	X0_H		.req	d1
-	Y0		.req	q1
-	Y0_H		.req	d3
-	X1		.req	q2
-	X1_L		.req	d4
-	X1_H		.req	d5
-	Y1		.req	q3
-	Y1_H		.req	d7
-	X2		.req	q4
-	X2_L		.req	d8
-	X2_H		.req	d9
-	Y2		.req	q5
-	Y2_H		.req	d11
-	X3		.req	q6
-	X3_L		.req	d12
-	X3_H		.req	d13
-	Y3		.req	q7
-	Y3_H		.req	d15
-
-	// the round key, duplicated in all lanes
-	ROUND_KEY	.req	q8
-	ROUND_KEY_L	.req	d16
-	ROUND_KEY_H	.req	d17
-
-	// index vector for vtbl-based 8-bit rotates
-	ROTATE_TABLE	.req	d18
-
-	// multiplication table for updating XTS tweaks
-	GF128MUL_TABLE	.req	d19
-	GF64MUL_TABLE	.req	d19
-
-	// current XTS tweak value(s)
-	TWEAKV		.req	q10
-	TWEAKV_L	.req	d20
-	TWEAKV_H	.req	d21
-
-	TMP0		.req	q12
-	TMP0_L		.req	d24
-	TMP0_H		.req	d25
-	TMP1		.req	q13
-	TMP2		.req	q14
-	TMP3		.req	q15
-
-	.align		4
-.Lror64_8_table:
-	.byte		1, 2, 3, 4, 5, 6, 7, 0
-.Lror32_8_table:
-	.byte		1, 2, 3, 0, 5, 6, 7, 4
-.Lrol64_8_table:
-	.byte		7, 0, 1, 2, 3, 4, 5, 6
-.Lrol32_8_table:
-	.byte		3, 0, 1, 2, 7, 4, 5, 6
-.Lgf128mul_table:
-	.byte		0, 0x87
-	.fill		14
-.Lgf64mul_table:
-	.byte		0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
-	.fill		12
-
-/*
- * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
- *
- * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
- * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
- * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
- *
- * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
- * the vtbl approach is faster on some processors and the same speed on others.
- */
-.macro _speck_round_128bytes	n
-
-	// x = ror(x, 8)
-	vtbl.8		X0_L, {X0_L}, ROTATE_TABLE
-	vtbl.8		X0_H, {X0_H}, ROTATE_TABLE
-	vtbl.8		X1_L, {X1_L}, ROTATE_TABLE
-	vtbl.8		X1_H, {X1_H}, ROTATE_TABLE
-	vtbl.8		X2_L, {X2_L}, ROTATE_TABLE
-	vtbl.8		X2_H, {X2_H}, ROTATE_TABLE
-	vtbl.8		X3_L, {X3_L}, ROTATE_TABLE
-	vtbl.8		X3_H, {X3_H}, ROTATE_TABLE
-
-	// x += y
-	vadd.u\n	X0, Y0
-	vadd.u\n	X1, Y1
-	vadd.u\n	X2, Y2
-	vadd.u\n	X3, Y3
-
-	// x ^= k
-	veor		X0, ROUND_KEY
-	veor		X1, ROUND_KEY
-	veor		X2, ROUND_KEY
-	veor		X3, ROUND_KEY
-
-	// y = rol(y, 3)
-	vshl.u\n	TMP0, Y0, #3
-	vshl.u\n	TMP1, Y1, #3
-	vshl.u\n	TMP2, Y2, #3
-	vshl.u\n	TMP3, Y3, #3
-	vsri.u\n	TMP0, Y0, #(\n - 3)
-	vsri.u\n	TMP1, Y1, #(\n - 3)
-	vsri.u\n	TMP2, Y2, #(\n - 3)
-	vsri.u\n	TMP3, Y3, #(\n - 3)
-
-	// y ^= x
-	veor		Y0, TMP0, X0
-	veor		Y1, TMP1, X1
-	veor		Y2, TMP2, X2
-	veor		Y3, TMP3, X3
-.endm
-
-/*
- * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
- *
- * This is the inverse of _speck_round_128bytes().
- */
-.macro _speck_unround_128bytes	n
-
-	// y ^= x
-	veor		TMP0, Y0, X0
-	veor		TMP1, Y1, X1
-	veor		TMP2, Y2, X2
-	veor		TMP3, Y3, X3
-
-	// y = ror(y, 3)
-	vshr.u\n	Y0, TMP0, #3
-	vshr.u\n	Y1, TMP1, #3
-	vshr.u\n	Y2, TMP2, #3
-	vshr.u\n	Y3, TMP3, #3
-	vsli.u\n	Y0, TMP0, #(\n - 3)
-	vsli.u\n	Y1, TMP1, #(\n - 3)
-	vsli.u\n	Y2, TMP2, #(\n - 3)
-	vsli.u\n	Y3, TMP3, #(\n - 3)
-
-	// x ^= k
-	veor		X0, ROUND_KEY
-	veor		X1, ROUND_KEY
-	veor		X2, ROUND_KEY
-	veor		X3, ROUND_KEY
-
-	// x -= y
-	vsub.u\n	X0, Y0
-	vsub.u\n	X1, Y1
-	vsub.u\n	X2, Y2
-	vsub.u\n	X3, Y3
-
-	// x = rol(x, 8);
-	vtbl.8		X0_L, {X0_L}, ROTATE_TABLE
-	vtbl.8		X0_H, {X0_H}, ROTATE_TABLE
-	vtbl.8		X1_L, {X1_L}, ROTATE_TABLE
-	vtbl.8		X1_H, {X1_H}, ROTATE_TABLE
-	vtbl.8		X2_L, {X2_L}, ROTATE_TABLE
-	vtbl.8		X2_H, {X2_H}, ROTATE_TABLE
-	vtbl.8		X3_L, {X3_L}, ROTATE_TABLE
-	vtbl.8		X3_H, {X3_H}, ROTATE_TABLE
-.endm
-
-.macro _xts128_precrypt_one	dst_reg, tweak_buf, tmp
-
-	// Load the next source block
-	vld1.8		{\dst_reg}, [SRC]!
-
-	// Save the current tweak in the tweak buffer
-	vst1.8		{TWEAKV}, [\tweak_buf:128]!
-
-	// XOR the next source block with the current tweak
-	veor		\dst_reg, TWEAKV
-
-	/*
-	 * Calculate the next tweak by multiplying the current one by x,
-	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
-	 */
-	vshr.u64	\tmp, TWEAKV, #63
-	vshl.u64	TWEAKV, #1
-	veor		TWEAKV_H, \tmp\()_L
-	vtbl.8		\tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
-	veor		TWEAKV_L, \tmp\()_H
-.endm
-
-.macro _xts64_precrypt_two	dst_reg, tweak_buf, tmp
-
-	// Load the next two source blocks
-	vld1.8		{\dst_reg}, [SRC]!
-
-	// Save the current two tweaks in the tweak buffer
-	vst1.8		{TWEAKV}, [\tweak_buf:128]!
-
-	// XOR the next two source blocks with the current two tweaks
-	veor		\dst_reg, TWEAKV
-
-	/*
-	 * Calculate the next two tweaks by multiplying the current ones by x^2,
-	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
-	 */
-	vshr.u64	\tmp, TWEAKV, #62
-	vshl.u64	TWEAKV, #2
-	vtbl.8		\tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
-	vtbl.8		\tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
-	veor		TWEAKV, \tmp
-.endm
-
-/*
- * _speck_xts_crypt() - Speck-XTS encryption/decryption
- *
- * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
- * using Speck-XTS, specifically the variant with a block size of '2n' and round
- * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
- * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
- * nonzero multiple of 128.
- */
-.macro _speck_xts_crypt	n, decrypting
-	push		{r4-r7}
-	mov		r7, sp
-
-	/*
-	 * The first four parameters were passed in registers r0-r3.  Load the
-	 * additional parameters, which were passed on the stack.
-	 */
-	ldr		NBYTES, [sp, #16]
-	ldr		TWEAK, [sp, #20]
-
-	/*
-	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
-	 * round key rather than the first, since for decryption the round keys
-	 * are used in reverse order.
-	 */
-.if \decrypting
-.if \n == 64
-	add		ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
-	sub		ROUND_KEYS, #8
-.else
-	add		ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
-	sub		ROUND_KEYS, #4
-.endif
-.endif
-
-	// Load the index vector for vtbl-based 8-bit rotates
-.if \decrypting
-	ldr		r12, =.Lrol\n\()_8_table
-.else
-	ldr		r12, =.Lror\n\()_8_table
-.endif
-	vld1.8		{ROTATE_TABLE}, [r12:64]
-
-	// One-time XTS preparation
-
-	/*
-	 * Allocate stack space to store 128 bytes worth of tweaks.  For
-	 * performance, this space is aligned to a 16-byte boundary so that we
-	 * can use the load/store instructions that declare 16-byte alignment.
-	 * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
-	 */
-	sub		r12, sp, #128
-	bic		r12, #0xf
-	mov		sp, r12
-
-.if \n == 64
-	// Load first tweak
-	vld1.8		{TWEAKV}, [TWEAK]
-
-	// Load GF(2^128) multiplication table
-	ldr		r12, =.Lgf128mul_table
-	vld1.8		{GF128MUL_TABLE}, [r12:64]
-.else
-	// Load first tweak
-	vld1.8		{TWEAKV_L}, [TWEAK]
-
-	// Load GF(2^64) multiplication table
-	ldr		r12, =.Lgf64mul_table
-	vld1.8		{GF64MUL_TABLE}, [r12:64]
-
-	// Calculate second tweak, packing it together with the first
-	vshr.u64	TMP0_L, TWEAKV_L, #63
-	vtbl.u8		TMP0_L, {GF64MUL_TABLE}, TMP0_L
-	vshl.u64	TWEAKV_H, TWEAKV_L, #1
-	veor		TWEAKV_H, TMP0_L
-.endif
-
-.Lnext_128bytes_\@:
-
-	/*
-	 * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
-	 * values, and save the tweaks on the stack for later.  Then
-	 * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
-	 * that the X[0-3] registers contain only the second halves of blocks,
-	 * and the Y[0-3] registers contain only the first halves of blocks.
-	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
-	 */
-	mov		r12, sp
-.if \n == 64
-	_xts128_precrypt_one	X0, r12, TMP0
-	_xts128_precrypt_one	Y0, r12, TMP0
-	_xts128_precrypt_one	X1, r12, TMP0
-	_xts128_precrypt_one	Y1, r12, TMP0
-	_xts128_precrypt_one	X2, r12, TMP0
-	_xts128_precrypt_one	Y2, r12, TMP0
-	_xts128_precrypt_one	X3, r12, TMP0
-	_xts128_precrypt_one	Y3, r12, TMP0
-	vswp		X0_L, Y0_H
-	vswp		X1_L, Y1_H
-	vswp		X2_L, Y2_H
-	vswp		X3_L, Y3_H
-.else
-	_xts64_precrypt_two	X0, r12, TMP0
-	_xts64_precrypt_two	Y0, r12, TMP0
-	_xts64_precrypt_two	X1, r12, TMP0
-	_xts64_precrypt_two	Y1, r12, TMP0
-	_xts64_precrypt_two	X2, r12, TMP0
-	_xts64_precrypt_two	Y2, r12, TMP0
-	_xts64_precrypt_two	X3, r12, TMP0
-	_xts64_precrypt_two	Y3, r12, TMP0
-	vuzp.32		Y0, X0
-	vuzp.32		Y1, X1
-	vuzp.32		Y2, X2
-	vuzp.32		Y3, X3
-.endif
-
-	// Do the cipher rounds
-
-	mov		r12, ROUND_KEYS
-	mov		r6, NROUNDS
-
-.Lnext_round_\@:
-.if \decrypting
-.if \n == 64
-	vld1.64		ROUND_KEY_L, [r12]
-	sub		r12, #8
-	vmov		ROUND_KEY_H, ROUND_KEY_L
-.else
-	vld1.32		{ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
-	sub		r12, #4
-.endif
-	_speck_unround_128bytes	\n
-.else
-.if \n == 64
-	vld1.64		ROUND_KEY_L, [r12]!
-	vmov		ROUND_KEY_H, ROUND_KEY_L
-.else
-	vld1.32		{ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
-.endif
-	_speck_round_128bytes	\n
-.endif
-	subs		r6, r6, #1
-	bne		.Lnext_round_\@
-
-	// Re-interleave the 'x' and 'y' elements of each block
-.if \n == 64
-	vswp		X0_L, Y0_H
-	vswp		X1_L, Y1_H
-	vswp		X2_L, Y2_H
-	vswp		X3_L, Y3_H
-.else
-	vzip.32		Y0, X0
-	vzip.32		Y1, X1
-	vzip.32		Y2, X2
-	vzip.32		Y3, X3
-.endif
-
-	// XOR the encrypted/decrypted blocks with the tweaks we saved earlier
-	mov		r12, sp
-	vld1.8		{TMP0, TMP1}, [r12:128]!
-	vld1.8		{TMP2, TMP3}, [r12:128]!
-	veor		X0, TMP0
-	veor		Y0, TMP1
-	veor		X1, TMP2
-	veor		Y1, TMP3
-	vld1.8		{TMP0, TMP1}, [r12:128]!
-	vld1.8		{TMP2, TMP3}, [r12:128]!
-	veor		X2, TMP0
-	veor		Y2, TMP1
-	veor		X3, TMP2
-	veor		Y3, TMP3
-
-	// Store the ciphertext in the destination buffer
-	vst1.8		{X0, Y0}, [DST]!
-	vst1.8		{X1, Y1}, [DST]!
-	vst1.8		{X2, Y2}, [DST]!
-	vst1.8		{X3, Y3}, [DST]!
-
-	// Continue if there are more 128-byte chunks remaining, else return
-	subs		NBYTES, #128
-	bne		.Lnext_128bytes_\@
-
-	// Store the next tweak
-.if \n == 64
-	vst1.8		{TWEAKV}, [TWEAK]
-.else
-	vst1.8		{TWEAKV_L}, [TWEAK]
-.endif
-
-	mov		sp, r7
-	pop		{r4-r7}
-	bx		lr
-.endm
-
-ENTRY(speck128_xts_encrypt_neon)
-	_speck_xts_crypt	n=64, decrypting=0
-ENDPROC(speck128_xts_encrypt_neon)
-
-ENTRY(speck128_xts_decrypt_neon)
-	_speck_xts_crypt	n=64, decrypting=1
-ENDPROC(speck128_xts_decrypt_neon)
-
-ENTRY(speck64_xts_encrypt_neon)
-	_speck_xts_crypt	n=32, decrypting=0
-ENDPROC(speck64_xts_encrypt_neon)
-
-ENTRY(speck64_xts_decrypt_neon)
-	_speck_xts_crypt	n=32, decrypting=1
-ENDPROC(speck64_xts_decrypt_neon)
diff --git a/arch/arm/crypto/speck-neon-glue.c b/arch/arm/crypto/speck-neon-glue.c
deleted file mode 100644
index f012c3ea998f..000000000000
--- a/arch/arm/crypto/speck-neon-glue.c
+++ /dev/null
@@ -1,288 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Note: the NIST recommendation for XTS only specifies a 128-bit block size,
- * but a 64-bit version (needed for Speck64) is fairly straightforward; the math
- * is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial
- * x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004:
- * "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes
- * OCB and PMAC"), represented as 0x1B.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/algapi.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/speck.h>
-#include <crypto/xts.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/* The assembly functions only handle multiples of 128 bytes */
-#define SPECK_NEON_CHUNK_SIZE	128
-
-/* Speck128 */
-
-struct speck128_xts_tfm_ctx {
-	struct speck128_tfm_ctx main_key;
-	struct speck128_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
-					  void *dst, const void *src,
-					  unsigned int nbytes, void *tweak);
-
-asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
-					  void *dst, const void *src,
-					  unsigned int nbytes, void *tweak);
-
-typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
-				     u8 *, const u8 *);
-typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
-					  const void *, unsigned int, void *);
-
-static __always_inline int
-__speck128_xts_crypt(struct skcipher_request *req,
-		     speck128_crypt_one_t crypt_one,
-		     speck128_xts_crypt_many_t crypt_many)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	le128 tweak;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-		u8 *dst = walk.dst.virt.addr;
-		const u8 *src = walk.src.virt.addr;
-
-		if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
-			unsigned int count;
-
-			count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
-			kernel_neon_begin();
-			(*crypt_many)(ctx->main_key.round_keys,
-				      ctx->main_key.nrounds,
-				      dst, src, count, &tweak);
-			kernel_neon_end();
-			dst += count;
-			src += count;
-			nbytes -= count;
-		}
-
-		/* Handle any remainder with generic code */
-		while (nbytes >= sizeof(tweak)) {
-			le128_xor((le128 *)dst, (const le128 *)src, &tweak);
-			(*crypt_one)(&ctx->main_key, dst, dst);
-			le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
-			gf128mul_x_ble(&tweak, &tweak);
-
-			dst += sizeof(tweak);
-			src += sizeof(tweak);
-			nbytes -= sizeof(tweak);
-		}
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-
-	return err;
-}
-
-static int speck128_xts_encrypt(struct skcipher_request *req)
-{
-	return __speck128_xts_crypt(req, crypto_speck128_encrypt,
-				    speck128_xts_encrypt_neon);
-}
-
-static int speck128_xts_decrypt(struct skcipher_request *req)
-{
-	return __speck128_xts_crypt(req, crypto_speck128_decrypt,
-				    speck128_xts_decrypt_neon);
-}
-
-static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			       unsigned int keylen)
-{
-	struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = xts_verify_key(tfm, key, keylen);
-	if (err)
-		return err;
-
-	keylen /= 2;
-
-	err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
-	if (err)
-		return err;
-
-	return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-/* Speck64 */
-
-struct speck64_xts_tfm_ctx {
-	struct speck64_tfm_ctx main_key;
-	struct speck64_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
-					 void *dst, const void *src,
-					 unsigned int nbytes, void *tweak);
-
-asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
-					 void *dst, const void *src,
-					 unsigned int nbytes, void *tweak);
-
-typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
-				    u8 *, const u8 *);
-typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
-					 const void *, unsigned int, void *);
-
-static __always_inline int
-__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
-		    speck64_xts_crypt_many_t crypt_many)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	__le64 tweak;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-		u8 *dst = walk.dst.virt.addr;
-		const u8 *src = walk.src.virt.addr;
-
-		if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
-			unsigned int count;
-
-			count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
-			kernel_neon_begin();
-			(*crypt_many)(ctx->main_key.round_keys,
-				      ctx->main_key.nrounds,
-				      dst, src, count, &tweak);
-			kernel_neon_end();
-			dst += count;
-			src += count;
-			nbytes -= count;
-		}
-
-		/* Handle any remainder with generic code */
-		while (nbytes >= sizeof(tweak)) {
-			*(__le64 *)dst = *(__le64 *)src ^ tweak;
-			(*crypt_one)(&ctx->main_key, dst, dst);
-			*(__le64 *)dst ^= tweak;
-			tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
-					    ((tweak & cpu_to_le64(1ULL << 63)) ?
-					     0x1B : 0));
-			dst += sizeof(tweak);
-			src += sizeof(tweak);
-			nbytes -= sizeof(tweak);
-		}
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-
-	return err;
-}
-
-static int speck64_xts_encrypt(struct skcipher_request *req)
-{
-	return __speck64_xts_crypt(req, crypto_speck64_encrypt,
-				   speck64_xts_encrypt_neon);
-}
-
-static int speck64_xts_decrypt(struct skcipher_request *req)
-{
-	return __speck64_xts_crypt(req, crypto_speck64_decrypt,
-				   speck64_xts_decrypt_neon);
-}
-
-static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			      unsigned int keylen)
-{
-	struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = xts_verify_key(tfm, key, keylen);
-	if (err)
-		return err;
-
-	keylen /= 2;
-
-	err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
-	if (err)
-		return err;
-
-	return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-static struct skcipher_alg speck_algs[] = {
-	{
-		.base.cra_name		= "xts(speck128)",
-		.base.cra_driver_name	= "xts-speck128-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= SPECK128_BLOCK_SIZE,
-		.base.cra_ctxsize	= sizeof(struct speck128_xts_tfm_ctx),
-		.base.cra_alignmask	= 7,
-		.base.cra_module	= THIS_MODULE,
-		.min_keysize		= 2 * SPECK128_128_KEY_SIZE,
-		.max_keysize		= 2 * SPECK128_256_KEY_SIZE,
-		.ivsize			= SPECK128_BLOCK_SIZE,
-		.walksize		= SPECK_NEON_CHUNK_SIZE,
-		.setkey			= speck128_xts_setkey,
-		.encrypt		= speck128_xts_encrypt,
-		.decrypt		= speck128_xts_decrypt,
-	}, {
-		.base.cra_name		= "xts(speck64)",
-		.base.cra_driver_name	= "xts-speck64-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= SPECK64_BLOCK_SIZE,
-		.base.cra_ctxsize	= sizeof(struct speck64_xts_tfm_ctx),
-		.base.cra_alignmask	= 7,
-		.base.cra_module	= THIS_MODULE,
-		.min_keysize		= 2 * SPECK64_96_KEY_SIZE,
-		.max_keysize		= 2 * SPECK64_128_KEY_SIZE,
-		.ivsize			= SPECK64_BLOCK_SIZE,
-		.walksize		= SPECK_NEON_CHUNK_SIZE,
-		.setkey			= speck64_xts_setkey,
-		.encrypt		= speck64_xts_encrypt,
-		.decrypt		= speck64_xts_decrypt,
-	}
-};
-
-static int __init speck_neon_module_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-	return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-static void __exit speck_neon_module_exit(void)
-{
-	crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-module_init(speck_neon_module_init);
-module_exit(speck_neon_module_exit);
-
-MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("xts(speck128)");
-MODULE_ALIAS_CRYPTO("xts-speck128-neon");
-MODULE_ALIAS_CRYPTO("xts(speck64)");
-MODULE_ALIAS_CRYPTO("xts-speck64-neon");
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b17ee03d280b..88286dd483ff 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -467,6 +467,17 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 #endif
 	.endm
 
+	.macro uaccess_mask_range_ptr, addr:req, size:req, limit:req, tmp:req
+#ifdef CONFIG_CPU_SPECTRE
+	sub	\tmp, \limit, #1
+	subs	\tmp, \tmp, \addr	@ tmp = limit - 1 - addr
+	addhs	\tmp, \tmp, #1		@ if (tmp >= 0) {
+	subhss	\tmp, \tmp, \size	@ tmp = limit - (addr + size) }
+	movlo	\addr, #0		@ if (tmp < 0) addr = NULL
+	csdb
+#endif
+	.endm
+
 	.macro	uaccess_disable, tmp, isb=1
 #ifdef CONFIG_CPU_SW_DOMAIN_PAN
 	/*
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h
index 237aa52d8733..36c951dd23b8 100644
--- a/arch/arm/include/asm/bug.h
+++ b/arch/arm/include/asm/bug.h
@@ -62,8 +62,8 @@ do {								\
 struct pt_regs;
 void die(const char *msg, struct pt_regs *regs, int err);
 
-struct siginfo;
-void arm_notify_die(const char *str, struct pt_regs *regs, struct siginfo *info,
+void arm_notify_die(const char *str, struct pt_regs *regs,
+		int signo, int si_code, void __user *addr,
 		unsigned long err, unsigned long trap);
 
 #ifdef CONFIG_ARM_LPAE
diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index 9e842ff41768..18b0197f2384 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -16,9 +16,6 @@ extern void __gnu_mcount_nc(void);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 struct dyn_arch_ftrace {
-#ifdef CONFIG_OLD_MCOUNT
-	bool	old_mcount;
-#endif
 };
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index d4014fbe5ea3..0d9f3918fa7e 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -29,6 +29,7 @@
  * ptes.
  * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
  */
+#define __HAVE_ARCH_HUGE_PTEP_GET
 static inline pte_t huge_ptep_get(pte_t *ptep)
 {
 	pte_t retval = *ptep;
@@ -37,35 +38,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
 	return retval;
 }
 
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-				   pte_t *ptep, pte_t pte)
-{
-	set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
-					 unsigned long addr, pte_t *ptep)
-{
-	ptep_clear_flush(vma, addr, ptep);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-					   unsigned long addr, pte_t *ptep)
-{
-	ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
-					    unsigned long addr, pte_t *ptep)
-{
-	return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
-					     unsigned long addr, pte_t *ptep,
-					     pte_t pte, int dirty)
-{
-	return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
 #endif /* _ASM_ARM_HUGETLB_3LEVEL_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 7d26f6c4f0f5..b67256c22b08 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -23,18 +23,8 @@
 #define _ASM_ARM_HUGETLB_H
 
 #include <asm/page.h>
-#include <asm-generic/hugetlb.h>
-
 #include <asm/hugetlb-3level.h>
-
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
-					  unsigned long addr, unsigned long end,
-					  unsigned long floor,
-					  unsigned long ceiling)
-{
-	free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
+#include <asm-generic/hugetlb.h>
 
 static inline int is_hugepage_only_range(struct mm_struct *mm,
 					 unsigned long addr, unsigned long len)
@@ -42,27 +32,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 	return 0;
 }
 
-static inline int prepare_hugepage_range(struct file *file,
-					 unsigned long addr, unsigned long len)
-{
-	struct hstate *h = hstate_file(file);
-	if (len & ~huge_page_mask(h))
-		return -EINVAL;
-	if (addr & ~huge_page_mask(h))
-		return -EINVAL;
-	return 0;
-}
-
-static inline int huge_pte_none(pte_t pte)
-{
-	return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
-	return pte_wrprotect(pte);
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 	clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 2d43dca29c72..b95f8d0d9f17 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -133,8 +133,7 @@
  * space.
  */
 #define KVM_PHYS_SHIFT	(40)
-#define KVM_PHYS_SIZE	(_AC(1, ULL) << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK	(KVM_PHYS_SIZE - _AC(1, ULL))
+
 #define PTRS_PER_S2_PGD	(_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
 
 /* Virtualization Translation Control Register (VTCR) bits */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 3ad482d2f1eb..5ca5d9af0c26 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void)
 	kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	return 0;
 }
@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
 struct kvm *kvm_arch_alloc_vm(void);
 void kvm_arch_free_vm(struct kvm *kvm);
 
+static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+	/*
+	 * On 32bit ARM, VMs get a static 40bit IPA stage2 setup,
+	 * so any non-zero value used as type is illegal.
+	 */
+	if (type)
+		return -EINVAL;
+	return 0;
+}
+
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 847f01fa429d..1098ffc3d54b 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -35,16 +35,12 @@
 		addr;							\
 	})
 
-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
- */
-#define KVM_MMU_CACHE_MIN_PAGES	2
-
 #ifndef __ASSEMBLY__
 
 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
 #include <asm/cputype.h>
+#include <asm/kvm_arm.h>
 #include <asm/kvm_hyp.h>
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
@@ -52,6 +48,13 @@
 /* Ensure compatibility with arm64 */
 #define VA_BITS			32
 
+#define kvm_phys_shift(kvm)		KVM_PHYS_SHIFT
+#define kvm_phys_size(kvm)		(1ULL << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm)		(kvm_phys_size(kvm) - 1ULL)
+#define kvm_vttbr_baddr_mask(kvm)	VTTBR_BADDR_MASK
+
+#define stage2_pgd_size(kvm)		(PTRS_PER_S2_PGD * sizeof(pgd_t))
+
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 			   void __iomem **kaddr,
@@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)		(addr)
 
+static inline void kvm_set_ipa_limit(void) {}
+
 static inline bool kvm_cpu_has_cnp(void)
 {
 	return false;
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index 460d616bb2d6..f6a7ea805232 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -19,43 +19,53 @@
 #ifndef __ARM_S2_PGTABLE_H_
 #define __ARM_S2_PGTABLE_H_
 
-#define stage2_pgd_none(pgd)			pgd_none(pgd)
-#define stage2_pgd_clear(pgd)			pgd_clear(pgd)
-#define stage2_pgd_present(pgd)			pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud)		pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address)		pud_offset(pgd, address)
-#define stage2_pud_free(pud)			pud_free(NULL, pud)
-
-#define stage2_pud_none(pud)			pud_none(pud)
-#define stage2_pud_clear(pud)			pud_clear(pud)
-#define stage2_pud_present(pud)			pud_present(pud)
-#define stage2_pud_populate(pud, pmd)		pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address)		pmd_offset(pud, address)
-#define stage2_pmd_free(pmd)			pmd_free(NULL, pmd)
-
-#define stage2_pud_huge(pud)			pud_huge(pud)
+/*
+ * kvm_mmu_cache_min_pages() is the number of pages required
+ * to install a stage-2 translation. We pre-allocate the entry
+ * level table at VM creation. Since we have a 3 level page-table,
+ * we need only two pages to add a new mapping.
+ */
+#define kvm_mmu_cache_min_pages(kvm)	2
+
+#define stage2_pgd_none(kvm, pgd)		pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd)		pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd)		pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, pud)	pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(kvm, pgd, address)	pud_offset(pgd, address)
+#define stage2_pud_free(kvm, pud)		pud_free(NULL, pud)
+
+#define stage2_pud_none(kvm, pud)		pud_none(pud)
+#define stage2_pud_clear(kvm, pud)		pud_clear(pud)
+#define stage2_pud_present(kvm, pud)		pud_present(pud)
+#define stage2_pud_populate(kvm, pud, pmd)	pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(kvm, pud, address)	pmd_offset(pud, address)
+#define stage2_pmd_free(kvm, pmd)		pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(kvm, pud)		pud_huge(pud)
 
 /* Open coded p*d_addr_end that can deal with 64bit addresses */
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-#define stage2_pud_addr_end(addr, end)		(end)
+#define stage2_pud_addr_end(kvm, addr, end)	(end)
 
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-#define stage2_pgd_index(addr)				pgd_index(addr)
+#define stage2_pgd_index(kvm, addr)		pgd_index(addr)
 
-#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
-#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
-#define stage2_pud_table_empty(pudp)			false
+#define stage2_pte_table_empty(kvm, ptep)	kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(kvm, pmdp)	kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(kvm, pudp)	false
 
 #endif	/* __ARM_S2_PGTABLE_H_ */
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 9b37b6ab27fe..8f55dc520a3e 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -121,8 +121,8 @@ extern void vfp_flush_hwstate(struct thread_info *);
 struct user_vfp;
 struct user_vfp_exc;
 
-extern int vfp_preserve_user_clear_hwstate(struct user_vfp __user *,
-					   struct user_vfp_exc __user *);
+extern int vfp_preserve_user_clear_hwstate(struct user_vfp *,
+					   struct user_vfp_exc *);
 extern int vfp_restore_user_hwstate(struct user_vfp *,
 				    struct user_vfp_exc *);
 #endif
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 5451e1f05a19..c136eef8f690 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -69,6 +69,14 @@ extern int __put_user_bad(void);
 static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
+
+	/*
+	 * Prevent a mispredicted conditional call to set_fs from forwarding
+	 * the wrong address limit to access_ok under speculation.
+	 */
+	dsb(nsh);
+	isb();
+
 	modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
 }
 
@@ -92,6 +100,32 @@ static inline void set_fs(mm_segment_t fs)
 	__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 
 /*
+ * Sanitise a uaccess pointer such that it becomes NULL if addr+size
+ * is above the current addr_limit.
+ */
+#define uaccess_mask_range_ptr(ptr, size)			\
+	((__typeof__(ptr))__uaccess_mask_range_ptr(ptr, size))
+static inline void __user *__uaccess_mask_range_ptr(const void __user *ptr,
+						    size_t size)
+{
+	void __user *safe_ptr = (void __user *)ptr;
+	unsigned long tmp;
+
+	asm volatile(
+	"	sub	%1, %3, #1\n"
+	"	subs	%1, %1, %0\n"
+	"	addhs	%1, %1, #1\n"
+	"	subhss	%1, %1, %2\n"
+	"	movlo	%0, #0\n"
+	: "+r" (safe_ptr), "=&r" (tmp)
+	: "r" (size), "r" (current_thread_info()->addr_limit)
+	: "cc");
+
+	csdb();
+	return safe_ptr;
+}
+
+/*
  * Single-value transfer routines.  They automatically use the right
  * size if we just have the right pointer type.  Note that the functions
  * which read from user space (*get_*) need to take care not to leak
@@ -362,6 +396,14 @@ do {									\
 	__pu_err;							\
 })
 
+#ifdef CONFIG_CPU_SPECTRE
+/*
+ * When mitigating Spectre variant 1.1, all accessors need to include
+ * verification of the address space.
+ */
+#define __put_user(x, ptr) put_user(x, ptr)
+
+#else
 #define __put_user(x, ptr)						\
 ({									\
 	long __pu_err = 0;						\
@@ -369,12 +411,6 @@ do {									\
 	__pu_err;							\
 })
 
-#define __put_user_error(x, ptr, err)					\
-({									\
-	__put_user_switch((x), (ptr), (err), __put_user_nocheck);	\
-	(void) 0;							\
-})
-
 #define __put_user_nocheck(x, __pu_ptr, __err, __size)			\
 	do {								\
 		unsigned long __pu_addr = (unsigned long)__pu_ptr;	\
@@ -454,6 +490,7 @@ do {									\
 	: "r" (x), "i" (-EFAULT)				\
 	: "cc")
 
+#endif /* !CONFIG_CPU_SPECTRE */
 
 #ifdef CONFIG_MMU
 extern unsigned long __must_check
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 076090d2dbf5..88ef2ce1f69a 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -16,23 +16,23 @@
 #include <uapi/asm/unistd.h>
 #include <asm/unistd-nr.h>
 
+#define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_PAUSE
 #define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
 #define __ARCH_WANT_SYS_NICE
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_OLD_MMAP
 #define __ARCH_WANT_SYS_OLD_SELECT
+#define __ARCH_WANT_SYS_UTIME
 
 #if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT)
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_OLDUMOUNT
 #define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_UTIME
 #define __ARCH_WANT_SYS_OLD_GETRLIMIT
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_SYS_SOCKETCALL
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 783fbb4de5f9..8fa2dc21d332 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -167,9 +167,6 @@ EXPORT_SYMBOL(_find_next_bit_be);
 #endif
 
 #ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_OLD_MCOUNT
-EXPORT_SYMBOL(mcount);
-#endif
 EXPORT_SYMBOL(__gnu_mcount_nc);
 #endif
 
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index ecaa68dd1af5..13bcd3b867cb 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -87,14 +87,11 @@ void __init arm_dt_init_cpu_maps(void)
 	if (!cpus)
 		return;
 
-	for_each_child_of_node(cpus, cpu) {
+	for_each_of_cpu_node(cpu) {
 		const __be32 *cell;
 		int prop_bytes;
 		u32 hwid;
 
-		if (of_node_cmp(cpu->type, "cpu"))
-			continue;
-
 		pr_debug(" * %pOF...\n", cpu);
 		/*
 		 * A device tree containing CPU nodes with missing "reg"
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 746565a876dc..0465d65d23de 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -296,16 +296,15 @@ __sys_trace:
 	cmp	scno, #-1			@ skip the syscall?
 	bne	2b
 	add	sp, sp, #S_OFF			@ restore stack
-	b	ret_slow_syscall
 
-__sys_trace_return:
-	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
+__sys_trace_return_nosave:
+	enable_irq_notrace
 	mov	r0, sp
 	bl	syscall_trace_exit
 	b	ret_slow_syscall
 
-__sys_trace_return_nosave:
-	enable_irq_notrace
+__sys_trace_return:
+	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
 	mov	r0, sp
 	bl	syscall_trace_exit
 	b	ret_slow_syscall
diff --git a/arch/arm/kernel/entry-ftrace.S b/arch/arm/kernel/entry-ftrace.S
index efcd9f25a14b..0be69e551a64 100644
--- a/arch/arm/kernel/entry-ftrace.S
+++ b/arch/arm/kernel/entry-ftrace.S
@@ -15,23 +15,8 @@
  * start of every function.  In mcount, apart from the function's address (in
  * lr), we need to get hold of the function's caller's address.
  *
- * Older GCCs (pre-4.4) inserted a call to a routine called mcount like this:
- *
- *	bl	mcount
- *
- * These versions have the limitation that in order for the mcount routine to
- * be able to determine the function's caller's address, an APCS-style frame
- * pointer (which is set up with something like the code below) is required.
- *
- *	mov     ip, sp
- *	push    {fp, ip, lr, pc}
- *	sub     fp, ip, #4
- *
- * With EABI, these frame pointers are not available unless -mapcs-frame is
- * specified, and if building as Thumb-2, not even then.
- *
- * Newer GCCs (4.4+) solve this problem by introducing a new version of mcount,
- * with call sites like:
+ * Newer GCCs (4.4+) solve this problem by using a version of mcount with call
+ * sites like:
  *
  *	push	{lr}
  *	bl	__gnu_mcount_nc
@@ -46,17 +31,10 @@
  * allows it to be clobbered in subroutines and doesn't use it to hold
  * parameters.)
  *
- * When using dynamic ftrace, we patch out the mcount call by a "mov r0, r0"
- * for the mcount case, and a "pop {lr}" for the __gnu_mcount_nc case (see
- * arch/arm/kernel/ftrace.c).
+ * When using dynamic ftrace, we patch out the mcount call by a "pop {lr}"
+ * instead of the __gnu_mcount_nc call (see arch/arm/kernel/ftrace.c).
  */
 
-#ifndef CONFIG_OLD_MCOUNT
-#if (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
-#error Ftrace requires CONFIG_FRAME_POINTER=y with GCC older than 4.4.0.
-#endif
-#endif
-
 .macro mcount_adjust_addr rd, rn
 	bic	\rd, \rn, #1		@ clear the Thumb bit if present
 	sub	\rd, \rd, #MCOUNT_INSN_SIZE
@@ -209,51 +187,6 @@ ftrace_graph_call\suffix:
 	mcount_exit
 .endm
 
-#ifdef CONFIG_OLD_MCOUNT
-/*
- * mcount
- */
-
-.macro mcount_enter
-	stmdb	sp!, {r0-r3, lr}
-.endm
-
-.macro mcount_get_lr reg
-	ldr	\reg, [fp, #-4]
-.endm
-
-.macro mcount_exit
-	ldr	lr, [fp, #-4]
-	ldmia	sp!, {r0-r3, pc}
-.endm
-
-ENTRY(mcount)
-#ifdef CONFIG_DYNAMIC_FTRACE
-	stmdb	sp!, {lr}
-	ldr	lr, [fp, #-4]
-	ldmia	sp!, {pc}
-#else
-	__mcount _old
-#endif
-ENDPROC(mcount)
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(ftrace_caller_old)
-	__ftrace_caller _old
-ENDPROC(ftrace_caller_old)
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller_old)
-	__ftrace_graph_caller
-ENDPROC(ftrace_graph_caller_old)
-#endif
-
-.purgem mcount_enter
-.purgem mcount_get_lr
-.purgem mcount_exit
-#endif
-
 /*
  * __gnu_mcount_nc
  */
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 5617932a83df..0142fcfcc3d3 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -47,30 +47,6 @@ void arch_ftrace_update_code(int command)
 	stop_machine(__ftrace_modify_code, &command, NULL);
 }
 
-#ifdef CONFIG_OLD_MCOUNT
-#define OLD_MCOUNT_ADDR	((unsigned long) mcount)
-#define OLD_FTRACE_ADDR ((unsigned long) ftrace_caller_old)
-
-#define	OLD_NOP		0xe1a00000	/* mov r0, r0 */
-
-static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
-{
-	return rec->arch.old_mcount ? OLD_NOP : NOP;
-}
-
-static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
-{
-	if (!rec->arch.old_mcount)
-		return addr;
-
-	if (addr == MCOUNT_ADDR)
-		addr = OLD_MCOUNT_ADDR;
-	else if (addr == FTRACE_ADDR)
-		addr = OLD_FTRACE_ADDR;
-
-	return addr;
-}
-#else
 static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
 {
 	return NOP;
@@ -80,7 +56,6 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
 {
 	return addr;
 }
-#endif
 
 int ftrace_arch_code_modify_prepare(void)
 {
@@ -150,15 +125,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	}
 #endif
 
-#ifdef CONFIG_OLD_MCOUNT
-	if (!ret) {
-		pc = (unsigned long)&ftrace_call_old;
-		new = ftrace_call_replace(pc, (unsigned long)func);
-
-		ret = ftrace_modify_code(pc, 0, new, false);
-	}
-#endif
-
 	return ret;
 }
 
@@ -203,16 +169,6 @@ int ftrace_make_nop(struct module *mod,
 	new = ftrace_nop_replace(rec);
 	ret = ftrace_modify_code(ip, old, new, true);
 
-#ifdef CONFIG_OLD_MCOUNT
-	if (ret == -EINVAL && addr == MCOUNT_ADDR) {
-		rec->arch.old_mcount = true;
-
-		old = ftrace_call_replace(ip, adjust_address(rec, addr));
-		new = ftrace_nop_replace(rec);
-		ret = ftrace_modify_code(ip, old, new, true);
-	}
-#endif
-
 	return ret;
 }
 
@@ -290,13 +246,6 @@ static int ftrace_modify_graph_caller(bool enable)
 #endif
 
 
-#ifdef CONFIG_OLD_MCOUNT
-	if (!ret)
-		ret = __ftrace_modify_caller(&ftrace_graph_call_old,
-					     ftrace_graph_caller_old,
-					     enable);
-#endif
-
 	return ret;
 }
 
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 36718a424358..6fa5b6387556 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -203,15 +203,8 @@ void ptrace_disable(struct task_struct *child)
  */
 void ptrace_break(struct task_struct *tsk, struct pt_regs *regs)
 {
-	siginfo_t info;
-
-	clear_siginfo(&info);
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code  = TRAP_BRKPT;
-	info.si_addr  = (void __user *)instruction_pointer(regs);
-
-	force_sig_info(SIGTRAP, &info, tsk);
+	force_sig_fault(SIGTRAP, TRAP_BRKPT,
+			(void __user *)instruction_pointer(regs), tsk);
 }
 
 static int break_trap(struct pt_regs *regs, unsigned int instr)
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index b8f766cf3a90..b908382b69ff 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -77,8 +77,6 @@ static int preserve_iwmmxt_context(struct iwmmxt_sigframe __user *frame)
 		kframe->magic = IWMMXT_MAGIC;
 		kframe->size = IWMMXT_STORAGE_SIZE;
 		iwmmxt_task_copy(current_thread_info(), &kframe->storage);
-
-		err = __copy_to_user(frame, kframe, sizeof(*frame));
 	} else {
 		/*
 		 * For bug-compatibility with older kernels, some space
@@ -86,10 +84,14 @@ static int preserve_iwmmxt_context(struct iwmmxt_sigframe __user *frame)
 		 * Set the magic and size appropriately so that properly
 		 * written userspace can skip it reliably:
 		 */
-		__put_user_error(DUMMY_MAGIC, &frame->magic, err);
-		__put_user_error(IWMMXT_STORAGE_SIZE, &frame->size, err);
+		*kframe = (struct iwmmxt_sigframe) {
+			.magic = DUMMY_MAGIC,
+			.size  = IWMMXT_STORAGE_SIZE,
+		};
 	}
 
+	err = __copy_to_user(frame, kframe, sizeof(*kframe));
+
 	return err;
 }
 
@@ -135,17 +137,18 @@ static int restore_iwmmxt_context(char __user **auxp)
 
 static int preserve_vfp_context(struct vfp_sigframe __user *frame)
 {
-	const unsigned long magic = VFP_MAGIC;
-	const unsigned long size = VFP_STORAGE_SIZE;
+	struct vfp_sigframe kframe;
 	int err = 0;
 
-	__put_user_error(magic, &frame->magic, err);
-	__put_user_error(size, &frame->size, err);
+	memset(&kframe, 0, sizeof(kframe));
+	kframe.magic = VFP_MAGIC;
+	kframe.size = VFP_STORAGE_SIZE;
 
+	err = vfp_preserve_user_clear_hwstate(&kframe.ufp, &kframe.ufp_exc);
 	if (err)
-		return -EFAULT;
+		return err;
 
-	return vfp_preserve_user_clear_hwstate(&frame->ufp, &frame->ufp_exc);
+	return __copy_to_user(frame, &kframe, sizeof(kframe));
 }
 
 static int restore_vfp_context(char __user **auxp)
@@ -288,30 +291,35 @@ static int
 setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs, sigset_t *set)
 {
 	struct aux_sigframe __user *aux;
+	struct sigcontext context;
 	int err = 0;
 
-	__put_user_error(regs->ARM_r0, &sf->uc.uc_mcontext.arm_r0, err);
-	__put_user_error(regs->ARM_r1, &sf->uc.uc_mcontext.arm_r1, err);
-	__put_user_error(regs->ARM_r2, &sf->uc.uc_mcontext.arm_r2, err);
-	__put_user_error(regs->ARM_r3, &sf->uc.uc_mcontext.arm_r3, err);
-	__put_user_error(regs->ARM_r4, &sf->uc.uc_mcontext.arm_r4, err);
-	__put_user_error(regs->ARM_r5, &sf->uc.uc_mcontext.arm_r5, err);
-	__put_user_error(regs->ARM_r6, &sf->uc.uc_mcontext.arm_r6, err);
-	__put_user_error(regs->ARM_r7, &sf->uc.uc_mcontext.arm_r7, err);
-	__put_user_error(regs->ARM_r8, &sf->uc.uc_mcontext.arm_r8, err);
-	__put_user_error(regs->ARM_r9, &sf->uc.uc_mcontext.arm_r9, err);
-	__put_user_error(regs->ARM_r10, &sf->uc.uc_mcontext.arm_r10, err);
-	__put_user_error(regs->ARM_fp, &sf->uc.uc_mcontext.arm_fp, err);
-	__put_user_error(regs->ARM_ip, &sf->uc.uc_mcontext.arm_ip, err);
-	__put_user_error(regs->ARM_sp, &sf->uc.uc_mcontext.arm_sp, err);
-	__put_user_error(regs->ARM_lr, &sf->uc.uc_mcontext.arm_lr, err);
-	__put_user_error(regs->ARM_pc, &sf->uc.uc_mcontext.arm_pc, err);
-	__put_user_error(regs->ARM_cpsr, &sf->uc.uc_mcontext.arm_cpsr, err);
-
-	__put_user_error(current->thread.trap_no, &sf->uc.uc_mcontext.trap_no, err);
-	__put_user_error(current->thread.error_code, &sf->uc.uc_mcontext.error_code, err);
-	__put_user_error(current->thread.address, &sf->uc.uc_mcontext.fault_address, err);
-	__put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err);
+	context = (struct sigcontext) {
+		.arm_r0        = regs->ARM_r0,
+		.arm_r1        = regs->ARM_r1,
+		.arm_r2        = regs->ARM_r2,
+		.arm_r3        = regs->ARM_r3,
+		.arm_r4        = regs->ARM_r4,
+		.arm_r5        = regs->ARM_r5,
+		.arm_r6        = regs->ARM_r6,
+		.arm_r7        = regs->ARM_r7,
+		.arm_r8        = regs->ARM_r8,
+		.arm_r9        = regs->ARM_r9,
+		.arm_r10       = regs->ARM_r10,
+		.arm_fp        = regs->ARM_fp,
+		.arm_ip        = regs->ARM_ip,
+		.arm_sp        = regs->ARM_sp,
+		.arm_lr        = regs->ARM_lr,
+		.arm_pc        = regs->ARM_pc,
+		.arm_cpsr      = regs->ARM_cpsr,
+
+		.trap_no       = current->thread.trap_no,
+		.error_code    = current->thread.error_code,
+		.fault_address = current->thread.address,
+		.oldmask       = set->sig[0],
+	};
+
+	err |= __copy_to_user(&sf->uc.uc_mcontext, &context, sizeof(context));
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -328,7 +336,7 @@ setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs, sigset_t *set)
 	if (err == 0)
 		err |= preserve_vfp_context(&aux->vfp);
 #endif
-	__put_user_error(0, &aux->end_magic, err);
+	err |= __put_user(0, &aux->end_magic);
 
 	return err;
 }
@@ -491,7 +499,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
 	/*
 	 * Set uc.uc_flags to a value which sc.trap_no would never have.
 	 */
-	__put_user_error(0x5ac3c35a, &frame->uc.uc_flags, err);
+	err = __put_user(0x5ac3c35a, &frame->uc.uc_flags);
 
 	err |= setup_sigframe(frame, regs, set);
 	if (err == 0)
@@ -511,8 +519,8 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
 
 	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
 
-	__put_user_error(0, &frame->sig.uc.uc_flags, err);
-	__put_user_error(NULL, &frame->sig.uc.uc_link, err);
+	err |= __put_user(0, &frame->sig.uc.uc_flags);
+	err |= __put_user(NULL, &frame->sig.uc.uc_link);
 
 	err |= __save_altstack(&frame->sig.uc.uc_stack, regs->ARM_sp);
 	err |= setup_sigframe(&frame->sig, regs, set);
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 80517f293eb9..a188d5e8ab7f 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -98,22 +98,20 @@ static int proc_status_show(struct seq_file *m, void *v)
  */
 static void set_segfault(struct pt_regs *regs, unsigned long addr)
 {
-	siginfo_t info;
+	int si_code;
 
-	clear_siginfo(&info);
 	down_read(&current->mm->mmap_sem);
 	if (find_vma(current->mm, addr) == NULL)
-		info.si_code = SEGV_MAPERR;
+		si_code = SEGV_MAPERR;
 	else
-		info.si_code = SEGV_ACCERR;
+		si_code = SEGV_ACCERR;
 	up_read(&current->mm->mmap_sem);
 
-	info.si_signo = SIGSEGV;
-	info.si_errno = 0;
-	info.si_addr  = (void *) instruction_pointer(regs);
-
 	pr_debug("SWP{B} emulation: access caused memory abort!\n");
-	arm_notify_die("Illegal memory access", regs, &info, 0, 0);
+	arm_notify_die("Illegal memory access", regs,
+		       SIGSEGV, si_code,
+		       (void __user *)instruction_pointer(regs),
+		       0, 0);
 
 	abtcounter++;
 }
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index f0dd4b6ebb63..40da0872170f 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -277,6 +277,7 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
 				    int maxevents, int timeout)
 {
 	struct epoll_event *kbuf;
+	struct oabi_epoll_event e;
 	mm_segment_t fs;
 	long ret, err, i;
 
@@ -295,8 +296,11 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
 	set_fs(fs);
 	err = 0;
 	for (i = 0; i < ret; i++) {
-		__put_user_error(kbuf[i].events, &events->events, err);
-		__put_user_error(kbuf[i].data,   &events->data,   err);
+		e.events = kbuf[i].events;
+		e.data = kbuf[i].data;
+		err = __copy_to_user(events, &e, sizeof(e));
+		if (err)
+			break;
 		events++;
 	}
 	kfree(kbuf);
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 24ac3cab411d..60e375ce1ab2 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -94,12 +94,6 @@ static void __init parse_dt_topology(void)
 	__cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity),
 				 GFP_NOWAIT);
 
-	cn = of_find_node_by_path("/cpus");
-	if (!cn) {
-		pr_err("No CPU information found in DT\n");
-		return;
-	}
-
 	for_each_possible_cpu(cpu) {
 		const u32 *rate;
 		int len;
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index badf02ca3693..2d668cff8ef4 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -365,13 +365,14 @@ void die(const char *str, struct pt_regs *regs, int err)
 }
 
 void arm_notify_die(const char *str, struct pt_regs *regs,
-		struct siginfo *info, unsigned long err, unsigned long trap)
+		int signo, int si_code, void __user *addr,
+		unsigned long err, unsigned long trap)
 {
 	if (user_mode(regs)) {
 		current->thread.error_code = err;
 		current->thread.trap_no = trap;
 
-		force_sig_info(info->si_signo, info, current);
+		force_sig_fault(signo, si_code, addr, current);
 	} else {
 		die(str, regs, err);
 	}
@@ -438,10 +439,8 @@ int call_undef_hook(struct pt_regs *regs, unsigned int instr)
 asmlinkage void do_undefinstr(struct pt_regs *regs)
 {
 	unsigned int instr;
-	siginfo_t info;
 	void __user *pc;
 
-	clear_siginfo(&info);
 	pc = (void __user *)instruction_pointer(regs);
 
 	if (processor_mode(regs) == SVC_MODE) {
@@ -485,13 +484,8 @@ die_sig:
 		dump_instr(KERN_INFO, regs);
 	}
 #endif
-
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLOPC;
-	info.si_addr  = pc;
-
-	arm_notify_die("Oops - undefined instruction", regs, &info, 0, 6);
+	arm_notify_die("Oops - undefined instruction", regs,
+		       SIGILL, ILL_ILLOPC, pc, 0, 6);
 }
 NOKPROBE_SYMBOL(do_undefinstr)
 
@@ -539,9 +533,6 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason)
 
 static int bad_syscall(int n, struct pt_regs *regs)
 {
-	siginfo_t info;
-
-	clear_siginfo(&info);
 	if ((current->personality & PER_MASK) != PER_LINUX) {
 		send_sig(SIGSEGV, current, 1);
 		return regs->ARM_r0;
@@ -555,13 +546,10 @@ static int bad_syscall(int n, struct pt_regs *regs)
 	}
 #endif
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLTRP;
-	info.si_addr  = (void __user *)instruction_pointer(regs) -
-			 (thumb_mode(regs) ? 2 : 4);
-
-	arm_notify_die("Oops - bad syscall", regs, &info, n, 0);
+	arm_notify_die("Oops - bad syscall", regs, SIGILL, ILL_ILLTRP,
+		       (void __user *)instruction_pointer(regs) -
+			 (thumb_mode(regs) ? 2 : 4),
+		       n, 0);
 
 	return regs->ARM_r0;
 }
@@ -607,20 +595,13 @@ do_cache_op(unsigned long start, unsigned long end, int flags)
 #define NR(x) ((__ARM_NR_##x) - __ARM_NR_BASE)
 asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 {
-	siginfo_t info;
-
-	clear_siginfo(&info);
 	if ((no >> 16) != (__ARM_NR_BASE>> 16))
 		return bad_syscall(no, regs);
 
 	switch (no & 0xffff) {
 	case 0: /* branch through 0 */
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		info.si_code  = SEGV_MAPERR;
-		info.si_addr  = NULL;
-
-		arm_notify_die("branch through zero", regs, &info, 0, 0);
+		arm_notify_die("branch through zero", regs,
+			       SIGSEGV, SEGV_MAPERR, NULL, 0, 0);
 		return 0;
 
 	case NR(breakpoint): /* SWI BREAK_POINT */
@@ -688,13 +669,10 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		}
 	}
 #endif
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLTRP;
-	info.si_addr  = (void __user *)instruction_pointer(regs) -
-			 (thumb_mode(regs) ? 2 : 4);
-
-	arm_notify_die("Oops - bad syscall(2)", regs, &info, no, 0);
+	arm_notify_die("Oops - bad syscall(2)", regs, SIGILL, ILL_ILLTRP,
+		       (void __user *)instruction_pointer(regs) -
+			 (thumb_mode(regs) ? 2 : 4),
+		       no, 0);
 	return 0;
 }
 
@@ -744,9 +722,6 @@ asmlinkage void
 baddataabort(int code, unsigned long instr, struct pt_regs *regs)
 {
 	unsigned long addr = instruction_pointer(regs);
-	siginfo_t info;
-
-	clear_siginfo(&info);
 
 #ifdef CONFIG_DEBUG_USER
 	if (user_debug & UDBG_BADABORT) {
@@ -757,12 +732,8 @@ baddataabort(int code, unsigned long instr, struct pt_regs *regs)
 	}
 #endif
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code  = ILL_ILLOPC;
-	info.si_addr  = (void __user *)addr;
-
-	arm_notify_die("unknown data abort code", regs, &info, instr, 0);
+	arm_notify_die("unknown data abort code", regs,
+		       SIGILL, ILL_ILLOPC, (void __user *)addr, instr, 0);
 }
 
 void __readwrite_bug(const char *fn)
diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S
index 3593d5c1acd2..8c74037ade22 100644
--- a/arch/arm/kernel/vmlinux-xip.lds.S
+++ b/arch/arm/kernel/vmlinux-xip.lds.S
@@ -96,7 +96,6 @@ SECTIONS
 		INIT_SETUP(16)
 		INIT_CALLS
 		CON_INITCALL
-		SECURITY_INITCALL
 		INIT_RAM_FS
 	}
 
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index a826df3d3814..6709a8d33963 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -93,11 +93,7 @@ ENTRY(arm_copy_from_user)
 #ifdef CONFIG_CPU_SPECTRE
 	get_thread_info r3
 	ldr	r3, [r3, #TI_ADDR_LIMIT]
-	adds	ip, r1, r2	@ ip=addr+size
-	sub	r3, r3, #1	@ addr_limit - 1
-	cmpcc	ip, r3		@ if (addr+size > addr_limit - 1)
-	movcs	r1, #0		@ addr = NULL
-	csdb
+	uaccess_mask_range_ptr r1, r2, r3, ip
 #endif
 
 #include "copy_template.S"
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index caf5019d8161..970abe521197 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -94,6 +94,11 @@
 
 ENTRY(__copy_to_user_std)
 WEAK(arm_copy_to_user)
+#ifdef CONFIG_CPU_SPECTRE
+	get_thread_info r3
+	ldr	r3, [r3, #TI_ADDR_LIMIT]
+	uaccess_mask_range_ptr r0, r2, r3, ip
+#endif
 
 #include "copy_template.S"
 
@@ -108,4 +113,3 @@ ENDPROC(__copy_to_user_std)
 	rsb	r0, r0, r2
 	copy_abort_end
 	.popsection
-
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 9b4ed1728616..73dc7360cbdd 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -152,7 +152,8 @@ arm_copy_to_user(void __user *to, const void *from, unsigned long n)
 		n = __copy_to_user_std(to, from, n);
 		uaccess_restore(ua_flags);
 	} else {
-		n = __copy_to_user_memcpy(to, from, n);
+		n = __copy_to_user_memcpy(uaccess_mask_range_ptr(to, n),
+					  from, n);
 	}
 	return n;
 }
diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S
index a7c6ae13c945..bfe1c4d06901 100644
--- a/arch/arm/mach-at91/pm_suspend.S
+++ b/arch/arm/mach-at91/pm_suspend.S
@@ -149,6 +149,14 @@ exit_suspend:
 ENDPROC(at91_pm_suspend_in_sram)
 
 ENTRY(at91_backup_mode)
+	/* Switch the master clock source to slow clock. */
+	ldr	pmc, .pmc_base
+	ldr	tmp1, [pmc, #AT91_PMC_MCKR]
+	bic	tmp1, tmp1, #AT91_PMC_CSS
+	str	tmp1, [pmc, #AT91_PMC_MCKR]
+
+	wait_mckrdy
+
 	/*BUMEN*/
 	ldr	r0, .sfr
 	mov	tmp1, #0x1
diff --git a/arch/arm/mach-mmp/devices.c b/arch/arm/mach-mmp/devices.c
index 671c7a09ab3d..0fca63c80e1a 100644
--- a/arch/arm/mach-mmp/devices.c
+++ b/arch/arm/mach-mmp/devices.c
@@ -277,21 +277,12 @@ struct platform_device pxa168_device_u2o = {
 
 #if IS_ENABLED(CONFIG_USB_EHCI_MV_U2O)
 struct resource pxa168_u2oehci_resources[] = {
-	/* regbase */
 	[0] = {
-		.start	= PXA168_U2O_REGBASE + U2x_CAPREGS_OFFSET,
+		.start	= PXA168_U2O_REGBASE,
 		.end	= PXA168_U2O_REGBASE + USB_REG_RANGE,
 		.flags	= IORESOURCE_MEM,
-		.name	= "capregs",
 	},
-	/* phybase */
 	[1] = {
-		.start	= PXA168_U2O_PHYBASE,
-		.end	= PXA168_U2O_PHYBASE + USB_PHY_RANGE,
-		.flags	= IORESOURCE_MEM,
-		.name	= "phyregs",
-	},
-	[2] = {
 		.start	= IRQ_PXA168_USB1,
 		.end	= IRQ_PXA168_USB1,
 		.flags	= IORESOURCE_IRQ,
diff --git a/arch/arm/mach-shmobile/pm-rcar-gen2.c b/arch/arm/mach-shmobile/pm-rcar-gen2.c
index 345af3ebcc3a..7efe95bd584f 100644
--- a/arch/arm/mach-shmobile/pm-rcar-gen2.c
+++ b/arch/arm/mach-shmobile/pm-rcar-gen2.c
@@ -50,7 +50,7 @@ void __init rcar_gen2_pm_init(void)
 	void __iomem *p;
 	u32 bar;
 	static int once;
-	struct device_node *np, *cpus;
+	struct device_node *np;
 	bool has_a7 = false;
 	bool has_a15 = false;
 	struct resource res;
@@ -59,11 +59,7 @@ void __init rcar_gen2_pm_init(void)
 	if (once++)
 		return;
 
-	cpus = of_find_node_by_path("/cpus");
-	if (!cpus)
-		return;
-
-	for_each_child_of_node(cpus, np) {
+	for_each_of_cpu_node(np) {
 		if (of_device_is_compatible(np, "arm,cortex-a15"))
 			has_a15 = true;
 		else if (of_device_is_compatible(np, "arm,cortex-a7"))
diff --git a/arch/arm/mach-shmobile/pm-rmobile.c b/arch/arm/mach-shmobile/pm-rmobile.c
index e348bcfe389d..94fdeef11b81 100644
--- a/arch/arm/mach-shmobile/pm-rmobile.c
+++ b/arch/arm/mach-shmobile/pm-rmobile.c
@@ -202,7 +202,7 @@ static void __init get_special_pds(void)
 	const struct of_device_id *id;
 
 	/* PM domains containing CPUs */
-	for_each_node_by_type(np, "cpu")
+	for_each_of_cpu_node(np)
 		add_special_pd(np, PD_CPU);
 
 	/* PM domain containing console */
diff --git a/arch/arm/mach-shmobile/timer.c b/arch/arm/mach-shmobile/timer.c
index 828e8aea037e..e48b0939693f 100644
--- a/arch/arm/mach-shmobile/timer.c
+++ b/arch/arm/mach-shmobile/timer.c
@@ -22,22 +22,16 @@
 
 void __init shmobile_init_delay(void)
 {
-	struct device_node *np, *cpus;
+	struct device_node *np;
 	u32 max_freq = 0;
 
-	cpus = of_find_node_by_path("/cpus");
-	if (!cpus)
-		return;
-
-	for_each_child_of_node(cpus, np) {
+	for_each_of_cpu_node(np) {
 		u32 freq;
 
 		if (!of_property_read_u32(np, "clock-frequency", &freq))
 			max_freq = max(max_freq, freq);
 	}
 
-	of_node_put(cpus);
-
 	if (!max_freq)
 		return;
 
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index bd2c739d8083..b54f8f8def36 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -948,15 +948,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		goto fixup;
 
 	if (ai_usermode & UM_SIGNAL) {
-		siginfo_t si;
-
-		clear_siginfo(&si);
-		si.si_signo = SIGBUS;
-		si.si_errno = 0;
-		si.si_code = BUS_ADRALN;
-		si.si_addr = (void __user *)addr;
-
-		force_sig_info(si.si_signo, &si, current);
+		force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr, current);
 	} else {
 		/*
 		 * We're about to disable the alignment trap and return to
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 3232afb6fdc0..f4ea4c62c613 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -161,13 +161,9 @@ __do_user_fault(struct task_struct *tsk, unsigned long addr,
 		unsigned int fsr, unsigned int sig, int code,
 		struct pt_regs *regs)
 {
-	struct siginfo si;
-
 	if (addr > TASK_SIZE)
 		harden_branch_predictor();
 
-	clear_siginfo(&si);
-
 #ifdef CONFIG_DEBUG_USER
 	if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
 	    ((user_debug & UDBG_BUS)  && (sig == SIGBUS))) {
@@ -181,11 +177,7 @@ __do_user_fault(struct task_struct *tsk, unsigned long addr,
 	tsk->thread.address = addr;
 	tsk->thread.error_code = fsr;
 	tsk->thread.trap_no = 14;
-	si.si_signo = sig;
-	si.si_errno = 0;
-	si.si_code = code;
-	si.si_addr = (void __user *)addr;
-	force_sig_info(sig, &si, tsk);
+	force_sig_fault(sig, code, (void __user *)addr, tsk);
 }
 
 void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
@@ -554,7 +546,6 @@ asmlinkage void
 do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
-	struct siginfo info;
 
 	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
 		return;
@@ -563,12 +554,8 @@ do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		inf->name, fsr, addr);
 	show_pte(current->mm, addr);
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code  = inf->code;
-	info.si_addr  = (void __user *)addr;
-	arm_notify_die("", regs, &info, fsr, 0);
+	arm_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
+		       fsr, 0);
 }
 
 void __init
@@ -588,7 +575,6 @@ asmlinkage void
 do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs)
 {
 	const struct fsr_info *inf = ifsr_info + fsr_fs(ifsr);
-	struct siginfo info;
 
 	if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs))
 		return;
@@ -596,12 +582,8 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs)
 	pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
 		inf->name, ifsr, addr);
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code  = inf->code;
-	info.si_addr  = (void __user *)addr;
-	arm_notify_die("", regs, &info, ifsr, 0);
+	arm_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
+		       ifsr, 0);
 }
 
 /*
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index dc7e6b50ef67..aff6e6eadc70 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -216,13 +216,6 @@ static struct notifier_block vfp_notifier_block = {
  */
 static void vfp_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
 {
-	siginfo_t info;
-
-	clear_siginfo(&info);
-	info.si_signo = SIGFPE;
-	info.si_code = sicode;
-	info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
-
 	/*
 	 * This is the same as NWFPE, because it's not clear what
 	 * this is used for
@@ -230,7 +223,9 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
 	current->thread.error_code = 0;
 	current->thread.trap_no = 6;
 
-	send_sig_info(SIGFPE, &info, current);
+	send_sig_fault(SIGFPE, sicode,
+		       (void __user *)(instruction_pointer(regs) - 4),
+		       current);
 }
 
 static void vfp_panic(char *reason, u32 inst)
@@ -553,12 +548,11 @@ void vfp_flush_hwstate(struct thread_info *thread)
  * Save the current VFP state into the provided structures and prepare
  * for entry into a new function (signal handler).
  */
-int vfp_preserve_user_clear_hwstate(struct user_vfp __user *ufp,
-				    struct user_vfp_exc __user *ufp_exc)
+int vfp_preserve_user_clear_hwstate(struct user_vfp *ufp,
+				    struct user_vfp_exc *ufp_exc)
 {
 	struct thread_info *thread = current_thread_info();
 	struct vfp_hard_struct *hwstate = &thread->vfpstate.hard;
-	int err = 0;
 
 	/* Ensure that the saved hwstate is up-to-date. */
 	vfp_sync_hwstate(thread);
@@ -567,22 +561,19 @@ int vfp_preserve_user_clear_hwstate(struct user_vfp __user *ufp,
 	 * Copy the floating point registers. There can be unused
 	 * registers see asm/hwcap.h for details.
 	 */
-	err |= __copy_to_user(&ufp->fpregs, &hwstate->fpregs,
-			      sizeof(hwstate->fpregs));
+	memcpy(&ufp->fpregs, &hwstate->fpregs, sizeof(hwstate->fpregs));
+
 	/*
 	 * Copy the status and control register.
 	 */
-	__put_user_error(hwstate->fpscr, &ufp->fpscr, err);
+	ufp->fpscr = hwstate->fpscr;
 
 	/*
 	 * Copy the exception registers.
 	 */
-	__put_user_error(hwstate->fpexc, &ufp_exc->fpexc, err);
-	__put_user_error(hwstate->fpinst, &ufp_exc->fpinst, err);
-	__put_user_error(hwstate->fpinst2, &ufp_exc->fpinst2, err);
-
-	if (err)
-		return -EFAULT;
+	ufp_exc->fpexc = hwstate->fpexc;
+	ufp_exc->fpinst = hwstate->fpinst;
+	ufp_exc->fpinst2 = ufp_exc->fpinst2;
 
 	/* Ensure that VFP is disabled. */
 	vfp_flush_hwstate(thread);