summary | refs | log | tree | commit | diff | stats
path: root/arch/arm64/crypto/chacha-neon-glue.c
diff options
context:
space:
mode:
author: Ard Biesheuvel <ard.biesheuvel@linaro.org> 2018-12-04 14:13:32 +0100
committer: Herbert Xu <herbert@gondor.apana.org.au> 2018-12-13 11:24:40 +0100
commit: f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6 (patch)
tree: 13a610a77ea99a6862e68fcacae99ea4b18924cf /arch/arm64/crypto/chacha-neon-glue.c
parent: crypto: tcrypt - add block size of 1472 to skcipher template (diff)
download: linux-f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6.tar.xz
download: linux-f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6.zip
crypto: arm64/chacha - optimize for arbitrary length inputs
Update the 4-way NEON ChaCha routine so it can handle input of any
length >64 bytes in its entirety, rather than having to call into
the 1-way routine and/or memcpy()s via temp buffers to handle the
tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt
benchmarks), performance drops by around 1% on Cortex-A57, while
performance for inputs drawn randomly from the range [64, 1024)
increases by around 30%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/chacha-neon-glue.c')
-rw-r--r-- arch/arm64/crypto/chacha-neon-glue.c | 38
1 files changed, 14 insertions, 24 deletions
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 346eb85498a1..67f8feb0c717 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -32,41 +32,29 @@
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+ int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- kernel_neon_begin();
- chacha_4block_xor_neon(state, dst, src, nrounds);
- kernel_neon_end();
+ if (bytes < CHACHA_BLOCK_SIZE) {
+ memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, bytes);
+ return;
+ }
+
+ while (bytes > 0) {
+ chacha_4block_xor_neon(state, dst, src, nrounds,
+ min(bytes, CHACHA_BLOCK_SIZE * 4));
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
-
- if (!bytes)
- return;
-
- kernel_neon_begin();
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
- }
- kernel_neon_end();
}
static int chacha_neon_stream_xor(struct skcipher_request *req,
@@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
+ kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}