| author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2018-12-04 14:13:32 +0100 |
|---|---|---|
| committer | Herbert Xu <herbert@gondor.apana.org.au> | 2018-12-13 11:24:40 +0100 |
| commit | f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6 (patch) | |
| tree | 13a610a77ea99a6862e68fcacae99ea4b18924cf /arch/arm64/crypto/chacha-neon-glue.c | |
| parent | crypto: tcrypt - add block size of 1472 to skcipher template (diff) | |
crypto: arm64/chacha - optimize for arbitrary length inputs
Update the 4-way NEON ChaCha routine so it can handle input of any
length >64 bytes in its entirety, rather than having to call into the
1-way routine and/or bounce data through temporary buffers with
memcpy() to handle the tail of a ChaCha invocation that is not a
multiple of 256 bytes; a sketch of the new dispatch appears below.
On inputs that are a multiple of 256 bytes (and thus in tcrypt
benchmarks), performance drops by around 1% on Cortex-A57, while
performance for inputs drawn randomly from the range [64, 1024)
increases by around 30%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
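In essence, only inputs shorter than one 64-byte block still take the
temporary-buffer path; everything longer, including a tail that is not a
multiple of 256 bytes, is fed to the 4-way routine, which now takes an
explicit byte count. Below is a standalone C sketch of that dispatch, with
tracing stubs standing in for the NEON assembly; `chacha_do`,
`chacha_4block_xor_stub`, and `min_int` are illustrative names, not the
kernel's:

```c
#include <stdio.h>
#include <string.h>

#define CHACHA_BLOCK_SIZE 64

/* Tracing stubs standing in for the NEON assembly routines. In the
 * kernel, chacha_4block_xor_neon() now takes a byte count and handles
 * any length > 64 internally. */
static void chacha_block_xor_stub(unsigned int *state, unsigned char *dst,
				  const unsigned char *src, int nrounds)
{
	printf("1-way call: tail < 64 bytes via temp buffer\n");
}

static void chacha_4block_xor_stub(unsigned int *state, unsigned char *dst,
				   const unsigned char *src, int nrounds,
				   int bytes)
{
	printf("4-way call: %d bytes\n", bytes);
}

static int min_int(int a, int b) { return a < b ? a : b; }

/* Mirrors the reworked chacha_doneon(): inputs shorter than one block
 * are bounced through a stack buffer; everything else goes to the
 * 4-way routine in chunks of at most 256 bytes, advancing the block
 * counter (state word 12) by 4 per chunk. */
static void chacha_do(unsigned int *state, unsigned char *dst,
		      const unsigned char *src, int bytes, int nrounds)
{
	unsigned char buf[CHACHA_BLOCK_SIZE];

	if (bytes < CHACHA_BLOCK_SIZE) {
		memcpy(buf, src, bytes);
		chacha_block_xor_stub(state, buf, buf, nrounds);
		memcpy(dst, buf, bytes);
		return;
	}

	while (bytes > 0) {
		chacha_4block_xor_stub(state, dst, src, nrounds,
				       min_int(bytes, CHACHA_BLOCK_SIZE * 4));
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
}

int main(void)
{
	unsigned int state[16] = { 0 };
	/* Buffers oversized so the final pointer bump stays in bounds. */
	unsigned char in[1024] = { 0 }, out[1024];

	chacha_do(state, out, in, 600, 20);	/* traces: 256 + 256 + 88 */
	return 0;
}
```

Run on a 600-byte input, this traces two full 256-byte 4-way calls followed
by one 88-byte 4-way call; the 1-way path only fires for inputs under 64
bytes.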
Diffstat (limited to 'arch/arm64/crypto/chacha-neon-glue.c')
-rw-r--r--	arch/arm64/crypto/chacha-neon-glue.c	38
1 file changed, 14 insertions(+), 24 deletions(-)
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 346eb85498a1..67f8feb0c717 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -32,41 +32,29 @@
 asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
 				      int nrounds);
 asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, int bytes);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
+			  int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		kernel_neon_end();
+	if (bytes < CHACHA_BLOCK_SIZE) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+		return;
+	}
+
+	while (bytes > 0) {
+		chacha_4block_xor_neon(state, dst, src, nrounds,
+				       min(bytes, CHACHA_BLOCK_SIZE * 4));
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
 }
 
 static int chacha_neon_stream_xor(struct skcipher_request *req,
@@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
+		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
 			      nbytes, ctx->nrounds);
+		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
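Two things are visible in the diff: the single chunking loop that replaces
the old 4-way/1-way/tail cascade, and kernel_neon_begin()/kernel_neon_end()
moving out of chacha_doneon() into chacha_neon_stream_xor(), so the NEON
unit is claimed once per walk step rather than once per 256-byte chunk. To
see how the loop carves up a request, here is a small standalone
illustration (plain C, not kernel code) using 1472 bytes, the length the
parent tcrypt patch adds:

```c
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

/* Illustration only: trace how the new loop in chacha_doneon() splits
 * an arbitrary request into calls to the 4-way NEON routine. */
int main(void)
{
	int bytes = 1472;		/* 5 * 256 + 192 */
	unsigned int counter = 0;	/* mirrors state[12] */

	while (bytes > 0) {
		int chunk = bytes < CHACHA_BLOCK_SIZE * 4 ?
			    bytes : CHACHA_BLOCK_SIZE * 4;

		printf("chacha_4block_xor_neon: %3d bytes, counter %u\n",
		       chunk, counter);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		counter += 4;
	}
	/* Prints five 256-byte calls followed by one 192-byte call; the
	 * 1-way fallback never runs for inputs >= 64 bytes. */
	return 0;
}
```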