crypto: x86/aes-ni-xts - use direct calls to and 4-way stride

The XTS asm helper arrangement is a bit odd: the 8-way stride helper consists of back-to-back calls to the 4-way core transforms, which are called indirectly, based on a boolean that indicates whether we are performing encryption or decryption. Given how costly indirect calls are on x86, let's switch to direct calls, and given how the 8-way stride doesn't really add anything substantial, use a 4-way stride instead, and make the asm core routine deal with any multiple of 4 blocks. Since 512 byte sectors or 4 KB blocks are the typical quantities XTS operates on, increase the stride exported to the glue helper to 512 bytes as well. As a result, the number of indirect calls is reduced from 3 per 64 bytes of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU) Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps") Tested-by: Eric Biggers <ebiggers@google.com> # x86_64 Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Ard Biesheuvel <ardb@kernel.org> 2020-12-31 17:41:54 +0100
committer: Herbert Xu <herbert@gondor.apana.org.au> 2021-01-08 05:39:47 +0100
commit: 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 (patch)
tree: d0f396b57e398f50604e9e9fb20e793a02b9ccf0 /arch/x86/crypto/aesni-intel_glue.c
parent: crypto: picoxcell - Remove PicoXcell driver (diff)
download: linux-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.tar.xz
linux-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.zip
1 files changed, 14 insertions, 11 deletions
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 96bdc1584215..84e3ed49b35d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,12 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 #define AVX_GEN2_OPTSIZE 640
 #define AVX_GEN4_OPTSIZE 4096
 
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
 #ifdef CONFIG_X86_64
 
 static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@@ -108,9 +114,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
-				 const u8 *in, bool enc, le128 *iv);
-
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * struct gcm_context_data.  May be uninitialized.
@@ -663,14 +666,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
 }
 
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, true, iv);
+	aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, false, iv);
+	aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
 static const struct common_glue_ctx aesni_enc_xts = {
@@ -678,8 +681,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_enc8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_enc32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_enc }
@@ -691,8 +694,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_dec8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_dec32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_dec }
author	Ard Biesheuvel <ardb@kernel.org>	2020-12-31 17:41:54 +0100
committer	Herbert Xu <herbert@gondor.apana.org.au>	2021-01-08 05:39:47 +0100
commit	86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 (patch)
tree	d0f396b57e398f50604e9e9fb20e793a02b9ccf0 /arch/x86/crypto/aesni-intel_glue.c
parent	crypto: picoxcell - Remove PicoXcell driver (diff)
download	linux-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.tar.xz linux-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.zip