summaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/chacha-neon-glue.c
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-12-04 14:13:33 +0100
committerHerbert Xu <herbert@gondor.apana.org.au>2018-12-13 11:24:55 +0100
commit2fe55987b2624a86a5c709a8df65d4de2608dc07 (patch)
treeaa1e6ba2142d05e74cdfabf2b581ca57a381b7f4 /arch/arm64/crypto/chacha-neon-glue.c
parentcrypto: arm64/chacha - optimize for arbitrary length inputs (diff)
downloadlinux-2fe55987b2624a86a5c709a8df65d4de2608dc07.tar.xz
linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.zip
crypto: arm64/chacha - use combined SIMD/ALU routine for more speed
To some degree, most known AArch64 micro-architectures appear to be able to issue ALU instructions in parellel to SIMD instructions without affecting the SIMD throughput. This means we can use the ALU to process a fifth ChaCha block while the SIMD is processing four blocks in parallel. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/chacha-neon-glue.c')
-rw-r--r--arch/arm64/crypto/chacha-neon-glue.c39
1 files changed, 20 insertions, 19 deletions
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 67f8feb0c717..bece1d85bd81 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -38,22 +38,23 @@ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- if (bytes < CHACHA_BLOCK_SIZE) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
- return;
- }
-
while (bytes > 0) {
- chacha_4block_xor_neon(state, dst, src, nrounds,
- min(bytes, CHACHA_BLOCK_SIZE * 4));
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
+ int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+ if (l <= CHACHA_BLOCK_SIZE) {
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ memcpy(buf, src, l);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, l);
+ state[12] += 1;
+ break;
+ }
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= CHACHA_BLOCK_SIZE * 5;
+ src += CHACHA_BLOCK_SIZE * 5;
+ dst += CHACHA_BLOCK_SIZE * 5;
+ state[12] += 5;
}
}
@@ -72,7 +73,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
+ nbytes = rounddown(nbytes, walk.stride);
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
@@ -131,7 +132,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
@@ -147,7 +148,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
@@ -163,7 +164,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,