diff options
author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2018-12-04 14:13:33 +0100 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2018-12-13 11:24:55 +0100 |
commit | 2fe55987b2624a86a5c709a8df65d4de2608dc07 (patch) | |
tree | aa1e6ba2142d05e74cdfabf2b581ca57a381b7f4 /arch/arm64/crypto/chacha-neon-glue.c | |
parent | crypto: arm64/chacha - optimize for arbitrary length inputs (diff) | |
download | linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.tar.xz linux-2fe55987b2624a86a5c709a8df65d4de2608dc07.zip |
crypto: arm64/chacha - use combined SIMD/ALU routine for more speed
To some degree, most known AArch64 micro-architectures appear to be
able to issue ALU instructions in parallel to SIMD instructions
without affecting the SIMD throughput. This means we can use the ALU
to process a fifth ChaCha block while the SIMD is processing four
blocks in parallel.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/chacha-neon-glue.c')
-rw-r--r-- | arch/arm64/crypto/chacha-neon-glue.c | 39 |
1 files changed, 20 insertions, 19 deletions
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 67f8feb0c717..bece1d85bd81 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -38,22 +38,23 @@ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, int bytes, int nrounds) { - u8 buf[CHACHA_BLOCK_SIZE]; - - if (bytes < CHACHA_BLOCK_SIZE) { - memcpy(buf, src, bytes); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, bytes); - return; - } - while (bytes > 0) { - chacha_4block_xor_neon(state, dst, src, nrounds, - min(bytes, CHACHA_BLOCK_SIZE * 4)); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= CHACHA_BLOCK_SIZE * 5; + src += CHACHA_BLOCK_SIZE * 5; + dst += CHACHA_BLOCK_SIZE * 5; + state[12] += 5; } } @@ -72,7 +73,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req, unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); + nbytes = rounddown(nbytes, walk.stride); kernel_neon_begin(); chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, @@ -131,7 +132,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha_neon, .decrypt = chacha_neon, @@ -147,7 +148,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - 
.walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, @@ -163,7 +164,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha12_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, |