crypto: arm64/chacha - use combined SIMD/ALU routine for more speed

To some degree, most known AArch64 micro-architectures appear to be able to issue ALU instructions in parellel to SIMD instructions without affecting the SIMD throughput. This means we can use the ALU to process a fifth ChaCha block while the SIMD is processing four blocks in parallel. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Ard Biesheuvel <ard.biesheuvel@linaro.org> 2018-12-04 14:13:33 +0100
committer: Herbert Xu <herbert@gondor.apana.org.au> 2018-12-13 11:24:55 +0100
commit: 2fe55987b2624a86a5c709a8df65d4de2608dc07 (patch)
tree: aa1e6ba2142d05e74cdfabf2b581ca57a381b7f4 /arch/arm64/crypto/chacha-neon-glue.c
parent: crypto: arm64/chacha - optimize for arbitrary length inputs (diff)
download: linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.tar.xz
linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.zip
1 files changed, 20 insertions, 19 deletions
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 67f8feb0c717..bece1d85bd81 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -38,22 +38,23 @@ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
 			  int bytes, int nrounds)
 {
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	if (bytes < CHACHA_BLOCK_SIZE) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-		return;
-	}
-
 	while (bytes > 0) {
-		chacha_4block_xor_neon(state, dst, src, nrounds,
-				       min(bytes, CHACHA_BLOCK_SIZE * 4));
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE];
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= CHACHA_BLOCK_SIZE * 5;
+		src += CHACHA_BLOCK_SIZE * 5;
+		dst += CHACHA_BLOCK_SIZE * 5;
+		state[12] += 5;
 	}
 }
 
@@ -72,7 +73,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		unsigned int nbytes = walk.nbytes;
 
 		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
+			nbytes = rounddown(nbytes, walk.stride);
 
 		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
@@ -131,7 +132,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= CHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= chacha_neon,
 		.decrypt		= chacha_neon,
@@ -147,7 +148,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
@@ -163,7 +164,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha12_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
author	Ard Biesheuvel <ard.biesheuvel@linaro.org>	2018-12-04 14:13:33 +0100
committer	Herbert Xu <herbert@gondor.apana.org.au>	2018-12-13 11:24:55 +0100
commit	2fe55987b2624a86a5c709a8df65d4de2608dc07 (patch)
tree	aa1e6ba2142d05e74cdfabf2b581ca57a381b7f4 /arch/arm64/crypto/chacha-neon-glue.c
parent	crypto: arm64/chacha - optimize for arbitrary length inputs (diff)
download	linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.tar.xz linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.zip