Diffstat (limited to 'arch/arm/crypto')
-rw-r--r--  arch/arm/crypto/Kconfig              |   7
-rw-r--r--  arch/arm/crypto/Makefile             |   2
-rw-r--r--  arch/arm/crypto/chacha20-neon-core.S | 277
-rw-r--r--  arch/arm/crypto/crc32-ce-glue.c      |   2
-rw-r--r--  arch/arm/crypto/ghash-ce-core.S      | 108
-rw-r--r--  arch/arm/crypto/ghash-ce-glue.c      |  38
-rw-r--r--  arch/arm/crypto/speck-neon-core.S    | 434
-rw-r--r--  arch/arm/crypto/speck-neon-glue.c    | 288
8 files changed, 275 insertions(+), 881 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 925d1364727a..ef0c7feea6e2 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
+ select CRYPTO_GF128MUL
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@@ -121,10 +122,4 @@ config CRYPTO_CHACHA20_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
-config CRYPTO_SPECK_NEON
- tristate "NEON accelerated Speck cipher algorithms"
- depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_SPECK
-
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8de542c48ade..bd5bceef0605 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -54,7 +53,6 @@ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y := crc32-ce-core.o crc32-ce-glue.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
-speck-neon-y := speck-neon-core.o speck-neon-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 451a849ad518..50e7b9896818 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -18,6 +18,34 @@
* (at your option) any later version.
*/
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d) vtbl.8 + vtbl.8 (rotations by multiples of 8 bits only,
+ * needs index vector)
+ *
+ * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit
+ * rotations, the only choices are (a) and (b). We use (a) since it takes
+ * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
+
#include <linux/linkage.h>
.text
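As a reference for the rotation discussion above, here is a minimal C sketch (not part of the patch, assuming little-endian 32-bit words) of the two techniques: a plain shift-pair rotate like (a), and the byte permutation that the new .Lrol8_table index vector performs on each 64-bit lane via vtbl.8.

	#include <stdint.h>
	#include <string.h>

	/* (a) vshl.u32 + vsri.u32 amounts to an ordinary 32-bit rotate: */
	static uint32_t rotl32(uint32_t x, unsigned int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* (d) vtbl.8 d6, {d6}, d10 with d10 = {3,0,1,2,7,4,5,6} selects
	 * source bytes by index, which rotates each little-endian 32-bit
	 * word of the lane left by 8 bits: */
	static void rol8_vtbl(uint8_t lane[8])
	{
		static const uint8_t rol8_table[8] = { 3, 0, 1, 2, 7, 4, 5, 6 };
		uint8_t src[8];
		int i;

		memcpy(src, lane, 8);
		for (i = 0; i < 8; i++)
			lane[i] = src[rol8_table[i]];
	}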
@@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon)
vmov q10, q2
vmov q11, q3
+ adr ip, .Lrol8_table
mov r3, #10
+ vld1.8 {d10}, [ip, :64]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon)
bx lr
ENDPROC(chacha20_block_xor_neon)
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
.align 5
ENTRY(chacha20_4block_xor_neon)
- push {r4-r6, lr}
- mov ip, sp // preserve the stack pointer
- sub r3, sp, #0x20 // allocate a 32 byte buffer
- bic r3, r3, #0x1f // aligned to 32 bytes
- mov sp, r3
+ push {r4-r5}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
// r0: Input state matrix, s
// r1: 4 data blocks output, o
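For orientation, a hedged C sketch of what the four-block path computes; chacha20_block() here is an assumed scalar single-block helper, not part of this patch. The NEON code produces the same four keystream blocks at counters n..n+3, which is what the .Lctrinc constant {0,1,2,3} adds to the replicated counter word x12. The sub/bic/mov sequence above just carves out a 32-byte, 32-byte-aligned spill buffer for q8-q9 below the stack pointer.

	#include <stdint.h>
	#include <string.h>

	/* Assumed scalar primitive: one 64-byte ChaCha20 block. */
	extern void chacha20_block(const uint32_t state[16], uint8_t out[64]);

	static void chacha20_4block_ref(const uint32_t state[16],
					uint8_t out[4][64])
	{
		uint32_t s[16];
		int i;

		for (i = 0; i < 4; i++) {
			memcpy(s, state, sizeof(s));
			s[12] += i;		/* counter values 0-3 (.Lctrinc) */
			chacha20_block(s, out[i]);
		}
	}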
@@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon)
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. For final XORing step we transpose the
- // matrix by interleaving 32- and then 64-bit words, which allows us to
- // do XOR in NEON registers.
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
//
- // x0..15[0-3] = s0..3[0..3]
- add r3, r0, #0x20
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [r3]
+ vld1.32 {q2-q3}, [ip]
- adr r3, CTRINC
+ adr r5, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
- vld1.32 {q11}, [r3, :128]
+ vld1.32 {q4}, [r5, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
- vadd.i32 q12, q12, q11 // x12 += counter values 0-3
vdup.32 q11, d5[1]
vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
vdup.32 q9, d4[1]
vdup.32 q8, d4[0]
vdup.32 q7, d3[1]
@@ -185,9 +218,13 @@ ENTRY(chacha20_4block_xor_neon)
vdup.32 q1, d0[1]
vdup.32 q0, d0[0]
+ adr ip, .Lrol8_table
mov r3, #10
+ b 1f
.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
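The `b 1f` above is a loop-entry trick: the first of the ten double-round iterations skips the reload of x8..9 from the stack, since nothing has been spilled yet. A sketch of that control flow in C (the helpers are illustrative stubs only):

	#include <stdio.h>

	static void reload_x8_x9(void) { puts("reload x8..9 from stack"); }
	static void double_round(void) { puts("double round"); }

	int main(void)
	{
		int n = 10;

		goto first;		/* b 1f: skip the reload on entry */
		do {
			reload_x8_x9();	/* vld1.32 {q8-q9}, [sp, :256] */
	first:
			double_round();
		} while (--n);		/* subs r3, r3, #1; bne .Ldoubleround4 */
		return 0;
	}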
@@ -236,24 +273,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q4
vadd.i32 q1, q1, q5
vadd.i32 q2, q2, q6
vadd.i32 q3, q3, q7
- veor q8, q12, q0
- veor q9, q13, q1
- vshl.u32 q12, q8, #8
- vshl.u32 q13, q9, #8
- vsri.u32 q12, q8, #24
- vsri.u32 q13, q9, #24
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
- veor q8, q14, q2
- veor q9, q15, q3
- vshl.u32 q14, q8, #8
- vshl.u32 q15, q9, #8
- vsri.u32 q14, q8, #24
- vsri.u32 q15, q9, #24
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
vld1.32 {q8-q9}, [sp, :256]
@@ -332,24 +370,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q5
vadd.i32 q1, q1, q6
vadd.i32 q2, q2, q7
vadd.i32 q3, q3, q4
- veor q8, q15, q0
- veor q9, q12, q1
- vshl.u32 q15, q8, #8
- vshl.u32 q12, q9, #8
- vsri.u32 q15, q8, #24
- vsri.u32 q12, q9, #24
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
- veor q8, q13, q2
- veor q9, q14, q3
- vshl.u32 q13, q8, #8
- vshl.u32 q14, q9, #8
- vsri.u32 q13, q8, #24
- vsri.u32 q14, q9, #24
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
vld1.32 {q8-q9}, [sp, :256]
@@ -379,104 +418,76 @@ ENTRY(chacha20_4block_xor_neon)
vsri.u32 q6, q9, #25
subs r3, r3, #1
- beq 0f
-
- vld1.32 {q8-q9}, [sp, :256]
- b .Ldoubleround4
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
-0: ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q0, q0, q8
- vadd.i32 q1, q1, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q2, q2, q8
- vadd.i32 q3, q3, q9
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q4, q4, q8
- vadd.i32 q5, q5, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q6, q6, q8
- vadd.i32 q7, q7, q9
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q0, q1
- vzip.32 q2, q3
- vzip.32 q4, q5
- vzip.32 q6, q7
-
- // interleave 64-bit words in state n, n+2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
vswp d1, d4
vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
vswp d9, d12
vswp d11, d14
- // xor with corresponding input, write to output
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
vld1.8 {q8-q9}, [r2]!
veor q8, q8, q0
- veor q9, q9, q4
+ veor q9, q9, q1
vst1.8 {q8-q9}, [r1]!
+ // Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- vadd.i32 q8, q8, q0
- vadd.i32 q9, q9, q4
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q10, q10, q0
- vadd.i32 q11, q11, q4
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- adr r3, CTRINC
- vadd.i32 q12, q12, q0
- vld1.32 {q0}, [r3, :128]
- vadd.i32 q13, q13, q4
- vadd.i32 q12, q12, q0 // x12 += counter values 0-3
-
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q14, q14, q0
- vadd.i32 q15, q15, q4
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q8, q9
- vzip.32 q10, q11
- vzip.32 q12, q13
- vzip.32 q14, q15
-
- // interleave 64-bit words in state n, n+2
- vswp d17, d20
- vswp d19, d22
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
vswp d25, d28
vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
- vmov q4, q1
+ // XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q8
@@ -509,13 +520,11 @@ ENTRY(chacha20_4block_xor_neon)
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
+ mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
- mov sp, ip
- pop {r4-r6, pc}
+ pop {r4-r5}
+ bx lr
ENDPROC(chacha20_4block_xor_neon)
-
- .align 4
-CTRINC: .word 0, 1, 2, 3
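The vzip.32/vswp choreography in the hunk above is, in net effect, a 4x4 transpose of 32-bit words: vzip.32 interleaves words of adjacent register pairs and vswp exchanges 64-bit halves, turning "word k of all four blocks" vectors into "four consecutive words of one block" vectors so the keystream can be XORed against contiguous data. A plain-C equivalent of the net effect (a sketch, not the register scheduling itself):

	#include <stdint.h>

	/* Before: m[k][j] = word k of block j.
	 * After:  m[j][k] = word k of block j, i.e. rows hold one
	 * block's consecutive words. */
	static void transpose_4x4(uint32_t m[4][4])
	{
		int i, j;

		for (i = 0; i < 4; i++)
			for (j = i + 1; j < 4; j++) {
				uint32_t t = m[i][j];

				m[i][j] = m[j][i];
				m[j][i] = t;
			}
	}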
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 96e62ec105d0..cd9e93b46c2d 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -236,7 +236,7 @@ static void __exit crc32_pmull_mod_exit(void)
ARRAY_SIZE(crc32_pmull_algs));
}
-static const struct cpu_feature crc32_cpu_feature[] = {
+static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
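The __maybe_unused annotation addresses a build-configuration detail: when the driver is built in rather than as a module, MODULE_DEVICE_TABLE() expands to nothing, leaving the table otherwise unreferenced, which some compilers flag as an unused constant variable. A minimal illustration of the attribute (the kernel defines it in its compiler headers; this define is shown only for self-containment):

	/* Assumed equivalent of the kernel's __maybe_unused */
	#define __maybe_unused __attribute__((__unused__))

	static const int example_table[] __maybe_unused = { 1, 2, 3 };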
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 2f78c10b1881..406009afa9cf 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -63,6 +63,33 @@
k48 .req d31
SHASH2_p64 .req d31
+ HH .req q10
+ HH3 .req q11
+ HH4 .req q12
+ HH34 .req q13
+
+ HH_L .req d20
+ HH_H .req d21
+ HH3_L .req d22
+ HH3_H .req d23
+ HH4_L .req d24
+ HH4_H .req d25
+ HH34_L .req d26
+ HH34_H .req d27
+ SHASH2_H .req d29
+
+ XL2 .req q5
+ XM2 .req q6
+ XH2 .req q7
+ T3 .req q8
+
+ XL2_L .req d10
+ XL2_H .req d11
+ XM2_L .req d12
+ XM2_H .req d13
+ T3_L .req d16
+ T3_H .req d17
+
.text
.fpu crypto-neon-fp-armv8
@@ -175,12 +202,77 @@
beq 0f
vld1.64 {T1}, [ip]
teq r0, #0
- b 1f
+ b 3f
+
+0: .ifc \pn, p64
+ tst r0, #3 // skip until #blocks is a
+ bne 2f // round multiple of 4
+
+ vld1.8 {XL2-XM2}, [r2]!
+1: vld1.8 {T3-T2}, [r2]!
+ vrev64.8 XL2, XL2
+ vrev64.8 XM2, XM2
+
+ subs r0, r0, #4
+
+ vext.8 T1, XL2, XL2, #8
+ veor XL2_H, XL2_H, XL_L
+ veor XL, XL, T1
+
+ vrev64.8 T3, T3
+ vrev64.8 T1, T2
+
+ vmull.p64 XH, HH4_H, XL_H // a1 * b1
+ veor XL2_H, XL2_H, XL_H
+ vmull.p64 XL, HH4_L, XL_L // a0 * b0
+ vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0)
+
+ vmull.p64 XH2, HH3_H, XM2_L // a1 * b1
+ veor XM2_L, XM2_L, XM2_H
+ vmull.p64 XL2, HH3_L, XM2_H // a0 * b0
+ vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, HH_H, T3_L // a1 * b1
+ veor T3_L, T3_L, T3_H
+ vmull.p64 XL2, HH_L, T3_H // a0 * b0
+ vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, SHASH_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H
+ vmull.p64 XL2, SHASH_L, T1_H // a0 * b0
+ vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
-0: vld1.64 {T1}, [r2]!
+ beq 4f
+
+ vld1.8 {XL2-XM2}, [r2]!
+
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p64
+
+ veor T1, T1, XH
+ veor XL, XL, T1
+
+ b 1b
+ .endif
+
+2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
-1: /* multiply XL by SHASH in GF(2^128) */
+3: /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
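The four-way loop added above uses the standard aggregated-reduction trick. With XOR and multiplication taken in GF(2^128), folding four blocks B0..B3 into the digest X serially is

	X' = ((((X ^ B0)*H ^ B1)*H ^ B2)*H ^ B3)*H

which expands to

	X' = (X ^ B0)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H

so the precomputed powers H^2..H^4 (the new HH, HH3, HH4 aliases) let the code issue four Karatsuba-style vmull.p64 multiplications with a single modular reduction per four blocks.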
@@ -193,7 +285,7 @@
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
- veor T1, XL, XH
+4: veor T1, XL, XH
veor XM, XM, T1
__pmull_reduce_\pn
@@ -212,8 +304,14 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
- vld1.64 {SHASH}, [r3]
+ vld1.64 {SHASH}, [r3]!
+ vld1.64 {HH}, [r3]!
+ vld1.64 {HH3-HH4}, [r3]
+
veor SHASH2_p64, SHASH_L, SHASH_H
+ veor SHASH2_H, HH_L, HH_H
+ veor HH34_L, HH3_L, HH3_H
+ veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 8930fc4e7c22..b7d30b6cf49c 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
*
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 a;
- u64 b;
+ u64 h[2];
+ u64 h2[2];
+ u64 h3[2];
+ u64 h4[2];
};
struct ghash_desc_ctx {
@@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
return 0;
}
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+ u64 carry = be64_to_cpu(k->a) >> 63;
+
+ h[0] = (be64_to_cpu(k->b) << 1) | carry;
+ h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+ if (carry)
+ h[1] ^= 0xc200000000000000UL;
+}
+
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- u64 a, b;
+ be128 h, k;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- /* perform multiplication by 'x' in GF(2^128) */
- b = get_unaligned_be64(inkey);
- a = get_unaligned_be64(inkey + 8);
+ memcpy(&k, inkey, GHASH_BLOCK_SIZE);
+ ghash_reflect(key->h, &k);
+
+ h = k;
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h2, &h);
- key->a = (a << 1) | (b >> 63);
- key->b = (b << 1) | (a >> 63);
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h3, &h);
- if (b >> 63)
- key->b ^= 0xc200000000000000UL;
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h4, &h);
return 0;
}
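For reference, here is the serial computation that the new key layout accelerates, sketched with the generic helpers from <crypto/gf128mul.h> and <crypto/b128ops.h> (hence the new `select CRYPTO_GF128MUL` in Kconfig); this is an illustrative equivalent, not code from the patch:

	#include <crypto/b128ops.h>
	#include <crypto/gf128mul.h>

	/* Fold four blocks into the digest the straightforward way; the
	 * NEON path computes the equivalent
	 * (x ^ b0)*H^4 ^ b1*H^3 ^ b2*H^2 ^ b3*H using the powers
	 * precomputed in ghash_setkey(). */
	static void ghash_4blocks_ref(be128 *x, const be128 b[4], const be128 *h)
	{
		int i;

		for (i = 0; i < 4; i++) {
			be128_xor(x, x, &b[i]);	/* x ^= b[i] */
			gf128mul_lle(x, h);	/* x *= h in GF(2^128) */
		}
	}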
diff --git a/arch/arm/crypto/speck-neon-core.S b/arch/arm/crypto/speck-neon-core.S
deleted file mode 100644
index 57caa742016e..000000000000
--- a/arch/arm/crypto/speck-neon-core.S
+++ /dev/null
@@ -1,434 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
- .text
- .fpu neon
-
- // arguments
- ROUND_KEYS .req r0 // const {u64,u32} *round_keys
- NROUNDS .req r1 // int nrounds
- DST .req r2 // void *dst
- SRC .req r3 // const void *src
- NBYTES .req r4 // unsigned int nbytes
- TWEAK .req r5 // void *tweak
-
- // registers which hold the data being encrypted/decrypted
- X0 .req q0
- X0_L .req d0
- X0_H .req d1
- Y0 .req q1
- Y0_H .req d3
- X1 .req q2
- X1_L .req d4
- X1_H .req d5
- Y1 .req q3
- Y1_H .req d7
- X2 .req q4
- X2_L .req d8
- X2_H .req d9
- Y2 .req q5
- Y2_H .req d11
- X3 .req q6
- X3_L .req d12
- X3_H .req d13
- Y3 .req q7
- Y3_H .req d15
-
- // the round key, duplicated in all lanes
- ROUND_KEY .req q8
- ROUND_KEY_L .req d16
- ROUND_KEY_H .req d17
-
- // index vector for vtbl-based 8-bit rotates
- ROTATE_TABLE .req d18
-
- // multiplication table for updating XTS tweaks
- GF128MUL_TABLE .req d19
- GF64MUL_TABLE .req d19
-
- // current XTS tweak value(s)
- TWEAKV .req q10
- TWEAKV_L .req d20
- TWEAKV_H .req d21
-
- TMP0 .req q12
- TMP0_L .req d24
- TMP0_H .req d25
- TMP1 .req q13
- TMP2 .req q14
- TMP3 .req q15
-
- .align 4
-.Lror64_8_table:
- .byte 1, 2, 3, 4, 5, 6, 7, 0
-.Lror32_8_table:
- .byte 1, 2, 3, 0, 5, 6, 7, 4
-.Lrol64_8_table:
- .byte 7, 0, 1, 2, 3, 4, 5, 6
-.Lrol32_8_table:
- .byte 3, 0, 1, 2, 7, 4, 5, 6
-.Lgf128mul_table:
- .byte 0, 0x87
- .fill 14
-.Lgf64mul_table:
- .byte 0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
- .fill 12
-
-/*
- * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
- *
- * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
- * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
- * of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
- *
- * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
- * the vtbl approach is faster on some processors and the same speed on others.
- */
-.macro _speck_round_128bytes n
-
- // x = ror(x, 8)
- vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
- vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
- vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
- vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
- vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
- vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
- vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
- vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
-
- // x += y
- vadd.u\n X0, Y0
- vadd.u\n X1, Y1
- vadd.u\n X2, Y2
- vadd.u\n X3, Y3
-
- // x ^= k
- veor X0, ROUND_KEY
- veor X1, ROUND_KEY
- veor X2, ROUND_KEY
- veor X3, ROUND_KEY
-
- // y = rol(y, 3)
- vshl.u\n TMP0, Y0, #3
- vshl.u\n TMP1, Y1, #3
- vshl.u\n TMP2, Y2, #3
- vshl.u\n TMP3, Y3, #3
- vsri.u\n TMP0, Y0, #(\n - 3)
- vsri.u\n TMP1, Y1, #(\n - 3)
- vsri.u\n TMP2, Y2, #(\n - 3)
- vsri.u\n TMP3, Y3, #(\n - 3)
-
- // y ^= x
- veor Y0, TMP0, X0
- veor Y1, TMP1, X1
- veor Y2, TMP2, X2
- veor Y3, TMP3, X3
-.endm
-
-/*
- * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
- *
- * This is the inverse of _speck_round_128bytes().
- */
-.macro _speck_unround_128bytes n
-
- // y ^= x
- veor TMP0, Y0, X0
- veor TMP1, Y1, X1
- veor TMP2, Y2, X2
- veor TMP3, Y3, X3
-
- // y = ror(y, 3)
- vshr.u\n Y0, TMP0, #3
- vshr.u\n Y1, TMP1, #3
- vshr.u\n Y2, TMP2, #3
- vshr.u\n Y3, TMP3, #3
- vsli.u\n Y0, TMP0, #(\n - 3)
- vsli.u\n Y1, TMP1, #(\n - 3)
- vsli.u\n Y2, TMP2, #(\n - 3)
- vsli.u\n Y3, TMP3, #(\n - 3)
-
- // x ^= k
- veor X0, ROUND_KEY
- veor X1, ROUND_KEY
- veor X2, ROUND_KEY
- veor X3, ROUND_KEY
-
- // x -= y
- vsub.u\n X0, Y0
- vsub.u\n X1, Y1
- vsub.u\n X2, Y2
- vsub.u\n X3, Y3
-
- // x = rol(x, 8);
- vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
- vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
- vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
- vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
- vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
- vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
- vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
- vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
-.endm
-
-.macro _xts128_precrypt_one dst_reg, tweak_buf, tmp
-
- // Load the next source block
- vld1.8 {\dst_reg}, [SRC]!
-
- // Save the current tweak in the tweak buffer
- vst1.8 {TWEAKV}, [\tweak_buf:128]!
-
- // XOR the next source block with the current tweak
- veor \dst_reg, TWEAKV
-
- /*
- * Calculate the next tweak by multiplying the current one by x,
- * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
- */
- vshr.u64 \tmp, TWEAKV, #63
- vshl.u64 TWEAKV, #1
- veor TWEAKV_H, \tmp\()_L
- vtbl.8 \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
- veor TWEAKV_L, \tmp\()_H
-.endm
-
-.macro _xts64_precrypt_two dst_reg, tweak_buf, tmp
-
- // Load the next two source blocks
- vld1.8 {\dst_reg}, [SRC]!
-
- // Save the current two tweaks in the tweak buffer
- vst1.8 {TWEAKV}, [\tweak_buf:128]!
-
- // XOR the next two source blocks with the current two tweaks
- veor \dst_reg, TWEAKV
-
- /*
- * Calculate the next two tweaks by multiplying the current ones by x^2,
- * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
- */
- vshr.u64 \tmp, TWEAKV, #62
- vshl.u64 TWEAKV, #2
- vtbl.8 \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
- vtbl.8 \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
- veor TWEAKV, \tmp
-.endm
-
-/*
- * _speck_xts_crypt() - Speck-XTS encryption/decryption
- *
- * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
- * using Speck-XTS, specifically the variant with a block size of '2n' and round
- * count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
- * the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
- * nonzero multiple of 128.
- */
-.macro _speck_xts_crypt n, decrypting
- push {r4-r7}
- mov r7, sp
-
- /*
- * The first four parameters were passed in registers r0-r3. Load the
- * additional parameters, which were passed on the stack.
- */
- ldr NBYTES, [sp, #16]
- ldr TWEAK, [sp, #20]
-
- /*
- * If decrypting, modify the ROUND_KEYS parameter to point to the last
- * round key rather than the first, since for decryption the round keys
- * are used in reverse order.
- */
-.if \decrypting
-.if \n == 64
- add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
- sub ROUND_KEYS, #8
-.else
- add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
- sub ROUND_KEYS, #4
-.endif
-.endif
-
- // Load the index vector for vtbl-based 8-bit rotates
-.if \decrypting
- ldr r12, =.Lrol\n\()_8_table
-.else
- ldr r12, =.Lror\n\()_8_table
-.endif
- vld1.8 {ROTATE_TABLE}, [r12:64]
-
- // One-time XTS preparation
-
- /*
- * Allocate stack space to store 128 bytes worth of tweaks. For
- * performance, this space is aligned to a 16-byte boundary so that we
- * can use the load/store instructions that declare 16-byte alignment.
- * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
- */
- sub r12, sp, #128
- bic r12, #0xf
- mov sp, r12
-
-.if \n == 64
- // Load first tweak
- vld1.8 {TWEAKV}, [TWEAK]
-
- // Load GF(2^128) multiplication table
- ldr r12, =.Lgf128mul_table
- vld1.8 {GF128MUL_TABLE}, [r12:64]
-.else
- // Load first tweak
- vld1.8 {TWEAKV_L}, [TWEAK]
-
- // Load GF(2^64) multiplication table
- ldr r12, =.Lgf64mul_table
- vld1.8 {GF64MUL_TABLE}, [r12:64]
-
- // Calculate second tweak, packing it together with the first
- vshr.u64 TMP0_L, TWEAKV_L, #63
- vtbl.u8 TMP0_L, {GF64MUL_TABLE}, TMP0_L
- vshl.u64 TWEAKV_H, TWEAKV_L, #1
- veor TWEAKV_H, TMP0_L
-.endif
-
-.Lnext_128bytes_\@:
-
- /*
- * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
- * values, and save the tweaks on the stack for later. Then
- * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
- * that the X[0-3] registers contain only the second halves of blocks,
- * and the Y[0-3] registers contain only the first halves of blocks.
- * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
- */
- mov r12, sp
-.if \n == 64
- _xts128_precrypt_one X0, r12, TMP0
- _xts128_precrypt_one Y0, r12, TMP0
- _xts128_precrypt_one X1, r12, TMP0
- _xts128_precrypt_one Y1, r12, TMP0
- _xts128_precrypt_one X2, r12, TMP0
- _xts128_precrypt_one Y2, r12, TMP0
- _xts128_precrypt_one X3, r12, TMP0
- _xts128_precrypt_one Y3, r12, TMP0
- vswp X0_L, Y0_H
- vswp X1_L, Y1_H
- vswp X2_L, Y2_H
- vswp X3_L, Y3_H
-.else
- _xts64_precrypt_two X0, r12, TMP0
- _xts64_precrypt_two Y0, r12, TMP0
- _xts64_precrypt_two X1, r12, TMP0
- _xts64_precrypt_two Y1, r12, TMP0
- _xts64_precrypt_two X2, r12, TMP0
- _xts64_precrypt_two Y2, r12, TMP0
- _xts64_precrypt_two X3, r12, TMP0
- _xts64_precrypt_two Y3, r12, TMP0
- vuzp.32 Y0, X0
- vuzp.32 Y1, X1
- vuzp.32 Y2, X2
- vuzp.32 Y3, X3
-.endif
-
- // Do the cipher rounds
-
- mov r12, ROUND_KEYS
- mov r6, NROUNDS
-
-.Lnext_round_\@:
-.if \decrypting
-.if \n == 64
- vld1.64 ROUND_KEY_L, [r12]
- sub r12, #8
- vmov ROUND_KEY_H, ROUND_KEY_L
-.else
- vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
- sub r12, #4
-.endif
- _speck_unround_128bytes \n
-.else
-.if \n == 64
- vld1.64 ROUND_KEY_L, [r12]!
- vmov ROUND_KEY_H, ROUND_KEY_L
-.else
- vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
-.endif
- _speck_round_128bytes \n
-.endif
- subs r6, r6, #1
- bne .Lnext_round_\@
-
- // Re-interleave the 'x' and 'y' elements of each block
-.if \n == 64
- vswp X0_L, Y0_H
- vswp X1_L, Y1_H
- vswp X2_L, Y2_H
- vswp X3_L, Y3_H
-.else
- vzip.32 Y0, X0
- vzip.32 Y1, X1
- vzip.32 Y2, X2
- vzip.32 Y3, X3
-.endif
-
- // XOR the encrypted/decrypted blocks with the tweaks we saved earlier
- mov r12, sp
- vld1.8 {TMP0, TMP1}, [r12:128]!
- vld1.8 {TMP2, TMP3}, [r12:128]!
- veor X0, TMP0
- veor Y0, TMP1
- veor X1, TMP2
- veor Y1, TMP3
- vld1.8 {TMP0, TMP1}, [r12:128]!
- vld1.8 {TMP2, TMP3}, [r12:128]!
- veor X2, TMP0
- veor Y2, TMP1
- veor X3, TMP2
- veor Y3, TMP3
-
- // Store the ciphertext in the destination buffer
- vst1.8 {X0, Y0}, [DST]!
- vst1.8 {X1, Y1}, [DST]!
- vst1.8 {X2, Y2}, [DST]!
- vst1.8 {X3, Y3}, [DST]!
-
- // Continue if there are more 128-byte chunks remaining, else return
- subs NBYTES, #128
- bne .Lnext_128bytes_\@
-
- // Store the next tweak
-.if \n == 64
- vst1.8 {TWEAKV}, [TWEAK]
-.else
- vst1.8 {TWEAKV_L}, [TWEAK]
-.endif
-
- mov sp, r7
- pop {r4-r7}
- bx lr
-.endm
-
-ENTRY(speck128_xts_encrypt_neon)
- _speck_xts_crypt n=64, decrypting=0
-ENDPROC(speck128_xts_encrypt_neon)
-
-ENTRY(speck128_xts_decrypt_neon)
- _speck_xts_crypt n=64, decrypting=1
-ENDPROC(speck128_xts_decrypt_neon)
-
-ENTRY(speck64_xts_encrypt_neon)
- _speck_xts_crypt n=32, decrypting=0
-ENDPROC(speck64_xts_encrypt_neon)
-
-ENTRY(speck64_xts_decrypt_neon)
- _speck_xts_crypt n=32, decrypting=1
-ENDPROC(speck64_xts_decrypt_neon)
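The deleted _xts128_precrypt_one and _xts64_precrypt_two macros above updated XTS tweaks by multiplying by x (or x^2) in GF(2^128) or GF(2^64). In scalar C, the per-block updates they implemented look like the following sketch (t[0] being the low 64 bits):

	#include <stdint.h>

	/* Multiply the 128-bit tweak by x modulo x^128 + x^7 + x^2 + x + 1
	 * (the 0x87 in .Lgf128mul_table). */
	static void xts128_mul_x(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;

		t[1] = (t[1] << 1) | (t[0] >> 63);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}

	/* Multiply a 64-bit tweak by x modulo x^64 + x^4 + x^3 + x + 1
	 * (the 0x1B in .Lgf64mul_table). */
	static void xts64_mul_x(uint64_t *t)
	{
		uint64_t carry = *t >> 63;

		*t = (*t << 1) ^ (carry ? 0x1B : 0);
	}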
diff --git a/arch/arm/crypto/speck-neon-glue.c b/arch/arm/crypto/speck-neon-glue.c
deleted file mode 100644
index f012c3ea998f..000000000000
--- a/arch/arm/crypto/speck-neon-glue.c
+++ /dev/null
@@ -1,288 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Note: the NIST recommendation for XTS only specifies a 128-bit block size,
- * but a 64-bit version (needed for Speck64) is fairly straightforward; the math
- * is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial
- * x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004:
- * "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes
- * OCB and PMAC"), represented as 0x1B.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/algapi.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/speck.h>
-#include <crypto/xts.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/* The assembly functions only handle multiples of 128 bytes */
-#define SPECK_NEON_CHUNK_SIZE 128
-
-/* Speck128 */
-
-struct speck128_xts_tfm_ctx {
- struct speck128_tfm_ctx main_key;
- struct speck128_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck128_xts_crypt(struct skcipher_request *req,
- speck128_crypt_one_t crypt_one,
- speck128_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- le128 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- le128_xor((le128 *)dst, (const le128 *)src, &tweak);
- (*crypt_one)(&ctx->main_key, dst, dst);
- le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
- gf128mul_x_ble(&tweak, &tweak);
-
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck128_xts_encrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_encrypt,
- speck128_xts_encrypt_neon);
-}
-
-static int speck128_xts_decrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_decrypt,
- speck128_xts_decrypt_neon);
-}
-
-static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-/* Speck64 */
-
-struct speck64_xts_tfm_ctx {
- struct speck64_tfm_ctx main_key;
- struct speck64_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
- speck64_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- __le64 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- *(__le64 *)dst = *(__le64 *)src ^ tweak;
- (*crypt_one)(&ctx->main_key, dst, dst);
- *(__le64 *)dst ^= tweak;
- tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
- ((tweak & cpu_to_le64(1ULL << 63)) ?
- 0x1B : 0));
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck64_xts_encrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_encrypt,
- speck64_xts_encrypt_neon);
-}
-
-static int speck64_xts_decrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_decrypt,
- speck64_xts_decrypt_neon);
-}
-
-static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-static struct skcipher_alg speck_algs[] = {
- {
- .base.cra_name = "xts(speck128)",
- .base.cra_driver_name = "xts-speck128-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK128_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK128_128_KEY_SIZE,
- .max_keysize = 2 * SPECK128_256_KEY_SIZE,
- .ivsize = SPECK128_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck128_xts_setkey,
- .encrypt = speck128_xts_encrypt,
- .decrypt = speck128_xts_decrypt,
- }, {
- .base.cra_name = "xts(speck64)",
- .base.cra_driver_name = "xts-speck64-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK64_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK64_96_KEY_SIZE,
- .max_keysize = 2 * SPECK64_128_KEY_SIZE,
- .ivsize = SPECK64_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck64_xts_setkey,
- .encrypt = speck64_xts_encrypt,
- .decrypt = speck64_xts_decrypt,
- }
-};
-
-static int __init speck_neon_module_init(void)
-{
- if (!(elf_hwcap & HWCAP_NEON))
- return -ENODEV;
- return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-static void __exit speck_neon_module_exit(void)
-{
- crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-module_init(speck_neon_module_init);
-module_exit(speck_neon_module_exit);
-
-MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("xts(speck128)");
-MODULE_ALIAS_CRYPTO("xts-speck128-neon");
-MODULE_ALIAS_CRYPTO("xts(speck64)");
-MODULE_ALIAS_CRYPTO("xts-speck64-neon");