From 158b52ff1436cf794778774e8510cd42e773849c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Apr 2018 19:08:05 +0100 Subject: crypto: ghash-clmulni - fix spelling mistake: "acclerated" -> "accelerated" Trivial fix to spelling mistake in module description text Signed-off-by: Colin Ian King Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 0420bab19efb..2ddbe3a1868b 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -364,5 +364,5 @@ module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " - "acclerated by PCLMULQDQ-NI"); + "accelerated by PCLMULQDQ-NI"); MODULE_ALIAS_CRYPTO("ghash"); -- cgit v1.2.3 From 1d373d4e8e15b358f08de52956b32e0e38a11f84 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 11 May 2018 14:12:51 +0200 Subject: crypto: x86 - Add optimized AEGIS implementations This patch adds optimized implementations of AEGIS-128, AEGIS-128L, and AEGIS-256, utilizing the AES-NI and SSE2 x86 extensions. Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 8 + arch/x86/crypto/aegis128-aesni-asm.S | 749 ++++++++++++++++++++++++++++++ arch/x86/crypto/aegis128-aesni-glue.c | 407 ++++++++++++++++ arch/x86/crypto/aegis128l-aesni-asm.S | 825 +++++++++++++++++++++++++++++++++ arch/x86/crypto/aegis128l-aesni-glue.c | 407 ++++++++++++++++ arch/x86/crypto/aegis256-aesni-asm.S | 702 ++++++++++++++++++++++++++++ arch/x86/crypto/aegis256-aesni-glue.c | 407 ++++++++++++++++ crypto/Kconfig | 24 + 8 files changed, 3529 insertions(+) create mode 100644 arch/x86/crypto/aegis128-aesni-asm.S create mode 100644 arch/x86/crypto/aegis128-aesni-glue.c create mode 100644 arch/x86/crypto/aegis128l-aesni-asm.S create mode 100644 arch/x86/crypto/aegis128l-aesni-glue.c create mode 100644 arch/x86/crypto/aegis256-aesni-asm.S create mode 100644 arch/x86/crypto/aegis256-aesni-glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5f07333bb224..c183553a4bd6 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -38,6 +38,10 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o + # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -72,6 +76,10 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o +aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o +aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S new file mode 100644 index 000000000000..9254e0b6cc06 --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -0,0 +1,749 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define KEY %xmm5 +#define MSG %xmm5 +#define T0 %xmm6 +#define T1 %xmm7 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis128_const, "aM", @progbits, 32 +.align 16 +.Laegis128_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +.align 16 +.Laegis128_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * aegis128_update + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state (shifted positions) + * changed: + * T0 + */ +.macro aegis128_update + movdqa STATE4, T0 + aesenc STATE0, STATE4 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc T0, STATE3 +.endm + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis128_aesni_init) + FRAME_BEGIN + + /* load IV: */ + movdqu (%rdx), T1 + + /* load key: */ + movdqa (%rsi), KEY + pxor KEY, T1 + movdqa T1, STATE0 + movdqa KEY, STATE3 + movdqa KEY, STATE4 + + /* load the constants: */ + movdqa .Laegis128_const_0, STATE2 + movdqa .Laegis128_const_1, STATE1 + pxor STATE2, STATE3 + pxor STATE1, STATE4 + + /* update 10 times with KEY / KEY xor IV: */ + aegis128_update; pxor KEY, STATE4 + aegis128_update; pxor T1, STATE3 + aegis128_update; pxor KEY, STATE2 + aegis128_update; pxor T1, STATE1 + aegis128_update; pxor KEY, STATE0 + aegis128_update; pxor T1, STATE4 + aegis128_update; pxor KEY, STATE3 + aegis128_update; pxor T1, STATE2 + aegis128_update; pxor KEY, STATE1 + aegis128_update; pxor T1, STATE0 + + /* store the state: */ + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_init) + +/* + * void crypto_aegis128_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis128_aesni_ad) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lad_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + movdqa 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_1 + + movdqa 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_2 + + movdqa 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_3 + + movdqa 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_4 + + movdqa 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_0 + + add $0x50, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + movdqu 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_1 + + movdqu 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_2 + + movdqu 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_3 + + movdqu 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_4 + + movdqu 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_0 + + add $0x50, SRC + jmp .Lad_u_loop + + /* store the state: */ +.Lad_out_0: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_1: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_2: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_3: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_4: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_ad) + +.macro encrypt_block a s0 s1 s2 s3 s4 i + movdq\a (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + pxor \s1, T0 + pxor \s4, T0 + movdqa \s2, T1 + pand \s3, T1 + pxor T1, T0 + movdq\a T0, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN + cmp $0x10, LEN + jl .Lenc_out_\i +.endm + +/* + * void crypto_aegis128_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lenc_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Lenc_u_loop + + /* store the state: */ +.Lenc_out_0: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_1: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_2: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_3: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_4: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_enc) + +/* + * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + pxor STATE1, T0 + pxor STATE4, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + + call __store_partial + + aegis128_update + pxor MSG, STATE4 + + /* store the state: */ + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + + FRAME_END +ENDPROC(crypto_aegis128_aesni_enc_tail) + +.macro decrypt_block a s0 s1 s2 s3 s4 i + movdq\a (\i * 0x10)(SRC), MSG + pxor \s1, MSG + pxor \s4, MSG + movdqa \s2, T1 + pand \s3, T1 + pxor T1, MSG + movdq\a MSG, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN + cmp $0x10, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Ldec_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Ldec_u_loop + + /* store the state: */ +.Ldec_out_0: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_1: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_2: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_3: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_4: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_dec) + +/* + * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* decrypt message: */ + call __load_partial + + pxor STATE1, MSG + pxor STATE4, MSG + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, MSG + + movdqa MSG, T0 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Laegis128_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + aegis128_update + pxor MSG, STATE4 + + /* store the state: */ + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_dec_tail) + +/* + * void crypto_aegis128_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis128_aesni_final) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG + + /* update state: */ + aegis128_update; pxor MSG, STATE4 + aegis128_update; pxor MSG, STATE3 + aegis128_update; pxor MSG, STATE2 + aegis128_update; pxor MSG, STATE1 + aegis128_update; pxor MSG, STATE0 + aegis128_update; pxor MSG, STATE4 + aegis128_update; pxor MSG, STATE3 + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pxor STATE1, MSG + pxor STATE2, MSG + pxor STATE3, MSG + pxor STATE4, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c new file mode 100644 index 000000000000..5de7c0d46edf --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128 Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define AEGIS128_BLOCK_ALIGN 16 +#define AEGIS128_BLOCK_SIZE 16 +#define AEGIS128_NONCE_SIZE 16 +#define AEGIS128_STATE_BLOCKS 5 +#define AEGIS128_KEY_SIZE 16 +#define AEGIS128_MIN_AUTH_SIZE 8 +#define AEGIS128_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS128_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis128_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS128_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS128_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis128_aesni_ad(state, + AEGIS128_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis128_aesni_ad(state, left, src); + + src += left & ~(AEGIS128_BLOCK_SIZE - 1); + left &= AEGIS128_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); + crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis128_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS128_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); + + if (keylen != AEGIS128_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); + + return 0; +} + +static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS128_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS128_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis128_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis128_aesni_process_crypt(&state, req, ops); + crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis128_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis128_aesni_enc, + .crypt_tail = crypto_aegis128_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis128_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis128_aesni_dec, + .crypt_tail = crypto_aegis128_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128_aesni_alg[] = { + { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, + .init = crypto_aegis128_aesni_init_tfm, + .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis128", + .cra_driver_name = "__aegis128-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis128_aesni_setkey, + .setauthsize = cryptd_aegis128_aesni_setauthsize, + .encrypt = cryptd_aegis128_aesni_encrypt, + .decrypt = cryptd_aegis128_aesni_decrypt, + .init = cryptd_aegis128_aesni_init_tfm, + .exit = cryptd_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis128", + .cra_driver_name = "aegis128-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis128_aesni_alg, + ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +static void __exit crypto_aegis128_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis128_aesni_alg, + ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +module_init(crypto_aegis128_aesni_module_init); +module_exit(crypto_aegis128_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128"); +MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S new file mode 100644 index 000000000000..9263c344f2c7 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -0,0 +1,825 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define STATE5 %xmm5 +#define STATE6 %xmm6 +#define STATE7 %xmm7 +#define MSG0 %xmm8 +#define MSG1 %xmm9 +#define T0 %xmm10 +#define T1 %xmm11 +#define T2 %xmm12 +#define T3 %xmm13 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32 +.align 16 +.Laegis128l_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128l_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16 +.align 16 +.Laegis128l_counter0: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Laegis128l_counter1: + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG0 - first message block + * MSG1 - second message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG0, MSG0 + pxor MSG1, MSG1 + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG0 + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG0 + movq (%r8), T0 + pxor T0, MSG0 + +.Lld_partial_8: + mov LEN, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + movdqa MSG0, MSG1 + movdqu (SRC), MSG0 + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - first message block + * T1 - second message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0, (%r9) + movdqa T1, T0 + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +.macro update + movdqa STATE7, T0 + aesenc STATE0, STATE7 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc STATE4, STATE3 + aesenc STATE5, STATE4 + aesenc STATE6, STATE5 + aesenc T0, STATE6 +.endm + +.macro update0 + update + pxor MSG0, STATE7 + pxor MSG1, STATE3 +.endm + +.macro update1 + update + pxor MSG0, STATE6 + pxor MSG1, STATE2 +.endm + +.macro update2 + update + pxor MSG0, STATE5 + pxor MSG1, STATE1 +.endm + +.macro update3 + update + pxor MSG0, STATE4 + pxor MSG1, STATE0 +.endm + +.macro update4 + update + pxor MSG0, STATE3 + pxor MSG1, STATE7 +.endm + +.macro update5 + update + pxor MSG0, STATE2 + pxor MSG1, STATE6 +.endm + +.macro update6 + update + pxor MSG0, STATE1 + pxor MSG1, STATE5 +.endm + +.macro update7 + update + pxor MSG0, STATE0 + pxor MSG1, STATE4 +.endm + +.macro state_load + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + movdqu 0x50(STATEP), STATE5 + movdqu 0x60(STATEP), STATE6 + movdqu 0x70(STATEP), STATE7 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 s6 s7 + movdqu \s7, 0x00(STATEP) + movdqu \s0, 0x10(STATEP) + movdqu \s1, 0x20(STATEP) + movdqu \s2, 0x30(STATEP) + movdqu \s3, 0x40(STATEP) + movdqu \s4, 0x50(STATEP) + movdqu \s5, 0x60(STATEP) + movdqu \s6, 0x70(STATEP) +.endm + +.macro state_store0 + state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro state_store1 + state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro state_store2 + state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store3 + state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store4 + state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store5 + state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro state_store6 + state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro state_store7 + state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +/* + * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis128l_aesni_init) + FRAME_BEGIN + + /* load key: */ + movdqa (%rsi), MSG1 + movdqa MSG1, STATE0 + movdqa MSG1, STATE4 + movdqa MSG1, STATE5 + movdqa MSG1, STATE6 + movdqa MSG1, STATE7 + + /* load IV: */ + movdqu (%rdx), MSG0 + pxor MSG0, STATE0 + pxor MSG0, STATE4 + + /* load the constants: */ + movdqa .Laegis128l_const_0, STATE2 + movdqa .Laegis128l_const_1, STATE1 + movdqa STATE1, STATE3 + pxor STATE2, STATE5 + pxor STATE1, STATE6 + pxor STATE2, STATE7 + + /* update 10 times with IV and KEY: */ + update0 + update1 + update2 + update3 + update4 + update5 + update6 + update7 + update0 + update1 + + state_store1 + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_init) + +.macro ad_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + update\i + sub $0x20, LEN + cmp $0x20, LEN + jl .Lad_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis128l_aesni_ad) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Lad_out + + state_load + + mov SRC, %r8 + and $0xf, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + ad_block a 0 + ad_block a 1 + ad_block a 2 + ad_block a 3 + ad_block a 4 + ad_block a 5 + ad_block a 6 + ad_block a 7 + + add $0x100, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + ad_block u 0 + ad_block u 1 + ad_block u 2 + ad_block u 3 + ad_block u 4 + ad_block u 5 + ad_block u 6 + ad_block u 7 + + add $0x100, SRC + jmp .Lad_u_loop + +.Lad_out_0: + state_store0 + FRAME_END + ret + +.Lad_out_1: + state_store1 + FRAME_END + ret + +.Lad_out_2: + state_store2 + FRAME_END + ret + +.Lad_out_3: + state_store3 + FRAME_END + ret + +.Lad_out_4: + state_store4 + FRAME_END + ret + +.Lad_out_5: + state_store5 + FRAME_END + ret + +.Lad_out_6: + state_store6 + FRAME_END + ret + +.Lad_out_7: + state_store7 + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_ad) + +.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7 + pxor \s1, \m0 + pxor \s6, \m0 + movdqa \s2, T3 + pand \s3, T3 + pxor T3, \m0 + + pxor \s2, \m1 + pxor \s5, \m1 + movdqa \s6, T3 + pand \s7, T3 + pxor T3, \m1 +.endm + +.macro crypt0 m0 m1 + crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro crypt1 m0 m1 + crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro crypt2 m0 m1 + crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt3 m0 m1 + crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt4 m0 m1 + crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt5 m0 m1 + crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro crypt6 m0 m1 + crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro crypt7 m0 m1 + crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +.macro encrypt_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + movdqa MSG0, T0 + movdqa MSG1, T1 + crypt\i T0, T1 + movdq\a T0, (\i * 0x20 + 0x00)(DST) + movdq\a T1, (\i * 0x20 + 0x10)(DST) + + update\i + + sub $0x20, LEN + cmp $0x20, LEN + jl .Lenc_out_\i +.endm + +.macro decrypt_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + crypt\i MSG0, MSG1 + movdq\a MSG0, (\i * 0x20 + 0x00)(DST) + movdq\a MSG1, (\i * 0x20 + 0x10)(DST) + + update\i + + sub $0x20, LEN + cmp $0x20, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Lenc_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xf, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a 0 + encrypt_block a 1 + encrypt_block a 2 + encrypt_block a 3 + encrypt_block a 4 + encrypt_block a 5 + encrypt_block a 6 + encrypt_block a 7 + + add $0x100, SRC + add $0x100, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u 0 + encrypt_block u 1 + encrypt_block u 2 + encrypt_block u 3 + encrypt_block u 4 + encrypt_block u 5 + encrypt_block u 6 + encrypt_block u 7 + + add $0x100, SRC + add $0x100, DST + jmp .Lenc_u_loop + +.Lenc_out_0: + state_store0 + FRAME_END + ret + +.Lenc_out_1: + state_store1 + FRAME_END + ret + +.Lenc_out_2: + state_store2 + FRAME_END + ret + +.Lenc_out_3: + state_store3 + FRAME_END + ret + +.Lenc_out_4: + state_store4 + FRAME_END + ret + +.Lenc_out_5: + state_store5 + FRAME_END + ret + +.Lenc_out_6: + state_store6 + FRAME_END + ret + +.Lenc_out_7: + state_store7 + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_enc) + +/* + * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc_tail) + FRAME_BEGIN + + state_load + + /* encrypt message: */ + call __load_partial + + movdqa MSG0, T0 + movdqa MSG1, T1 + crypt0 T0, T1 + + call __store_partial + + update0 + + state_store0 + + FRAME_END +ENDPROC(crypto_aegis128l_aesni_enc_tail) + +/* + * void crypto_aegis128l_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Ldec_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a 0 + decrypt_block a 1 + decrypt_block a 2 + decrypt_block a 3 + decrypt_block a 4 + decrypt_block a 5 + decrypt_block a 6 + decrypt_block a 7 + + add $0x100, SRC + add $0x100, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u 0 + decrypt_block u 1 + decrypt_block u 2 + decrypt_block u 3 + decrypt_block u 4 + decrypt_block u 5 + decrypt_block u 6 + decrypt_block u 7 + + add $0x100, SRC + add $0x100, DST + jmp .Ldec_u_loop + +.Ldec_out_0: + state_store0 + FRAME_END + ret + +.Ldec_out_1: + state_store1 + FRAME_END + ret + +.Ldec_out_2: + state_store2 + FRAME_END + ret + +.Ldec_out_3: + state_store3 + FRAME_END + ret + +.Ldec_out_4: + state_store4 + FRAME_END + ret + +.Ldec_out_5: + state_store5 + FRAME_END + ret + +.Ldec_out_6: + state_store6 + FRAME_END + ret + +.Ldec_out_7: + state_store7 + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_dec) + +/* + * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec_tail) + FRAME_BEGIN + + state_load + + /* decrypt message: */ + call __load_partial + + crypt0 MSG0, MSG1 + + movdqa MSG0, T0 + movdqa MSG1, T1 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa T0, T1 + movdqa .Laegis128l_counter0, T2 + movdqa .Laegis128l_counter1, T3 + pcmpgtb T2, T0 + pcmpgtb T3, T1 + pand T0, MSG0 + pand T1, MSG1 + + update0 + + state_store0 + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_dec_tail) + +/* + * void crypto_aegis128l_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis128l_aesni_final) + FRAME_BEGIN + + state_load + + /* prepare length block: */ + movq %rdx, MSG0 + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG0 + psllq $3, MSG0 /* multiply by 8 (to get bit count) */ + + pxor STATE2, MSG0 + movdqa MSG0, MSG1 + + /* update state: */ + update0 + update1 + update2 + update3 + update4 + update5 + update6 + + /* xor tag: */ + movdqu (%rsi), T0 + + pxor STATE1, T0 + pxor STATE2, T0 + pxor STATE3, T0 + pxor STATE4, T0 + pxor STATE5, T0 + pxor STATE6, T0 + pxor STATE7, T0 + + movdqu T0, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_final) diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c new file mode 100644 index 000000000000..876e4866e633 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128L Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define AEGIS128L_BLOCK_ALIGN 16 +#define AEGIS128L_BLOCK_SIZE 32 +#define AEGIS128L_NONCE_SIZE 16 +#define AEGIS128L_STATE_BLOCKS 8 +#define AEGIS128L_KEY_SIZE 16 +#define AEGIS128L_MIN_AUTH_SIZE 8 +#define AEGIS128L_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128l_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128l_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS128L_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis128l_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS128L_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS128L_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis128l_aesni_ad(state, + AEGIS128L_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis128l_aesni_ad(state, left, src); + + src += left & ~(AEGIS128L_BLOCK_SIZE - 1); + left &= AEGIS128L_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos); + crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis128l_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS128L_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead); + + if (keylen != AEGIS128L_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE); + + return 0; +} + +static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS128L_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS128L_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis128l_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis128l_aesni_process_crypt(&state, req, ops); + crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis128l_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis128l_aesni_enc, + .crypt_tail = crypto_aegis128l_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis128l_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis128l_aesni_dec, + .crypt_tail = crypto_aegis128l_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128l_aesni_alg[] = { + { + .setkey = crypto_aegis128l_aesni_setkey, + .setauthsize = crypto_aegis128l_aesni_setauthsize, + .encrypt = crypto_aegis128l_aesni_encrypt, + .decrypt = crypto_aegis128l_aesni_decrypt, + .init = crypto_aegis128l_aesni_init_tfm, + .exit = crypto_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis128l", + .cra_driver_name = "__aegis128l-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis128l_aesni_setkey, + .setauthsize = cryptd_aegis128l_aesni_setauthsize, + .encrypt = cryptd_aegis128l_aesni_encrypt, + .decrypt = cryptd_aegis128l_aesni_decrypt, + .init = cryptd_aegis128l_aesni_init_tfm, + .exit = cryptd_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis128l", + .cra_driver_name = "aegis128l-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128l_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis128l_aesni_alg, + ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +static void __exit crypto_aegis128l_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis128l_aesni_alg, + ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +module_init(crypto_aegis128l_aesni_module_init); +module_exit(crypto_aegis128l_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128l"); +MODULE_ALIAS_CRYPTO("aegis128l-aesni"); diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S new file mode 100644 index 000000000000..1d977d515bf9 --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -0,0 +1,702 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define STATE5 %xmm5 +#define MSG %xmm6 +#define T0 %xmm7 +#define T1 %xmm8 +#define T2 %xmm9 +#define T3 %xmm10 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis256_const, "aM", @progbits, 32 +.align 16 +.Laegis256_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis256_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16 +.align 16 +.Laegis256_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +.macro update + movdqa STATE5, T0 + aesenc STATE0, STATE5 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc STATE4, STATE3 + aesenc T0, STATE4 +.endm + +.macro update0 m + update + pxor \m, STATE5 +.endm + +.macro update1 m + update + pxor \m, STATE4 +.endm + +.macro update2 m + update + pxor \m, STATE3 +.endm + +.macro update3 m + update + pxor \m, STATE2 +.endm + +.macro update4 m + update + pxor \m, STATE1 +.endm + +.macro update5 m + update + pxor \m, STATE0 +.endm + +.macro state_load + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + movdqu 0x50(STATEP), STATE5 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 + movdqu \s5, 0x00(STATEP) + movdqu \s0, 0x10(STATEP) + movdqu \s1, 0x20(STATEP) + movdqu \s2, 0x30(STATEP) + movdqu \s3, 0x40(STATEP) + movdqu \s4, 0x50(STATEP) +.endm + +.macro state_store0 + state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store1 + state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store2 + state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store3 + state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro state_store4 + state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro state_store5 + state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +/* + * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis256_aesni_init) + FRAME_BEGIN + + /* load key: */ + movdqa 0x00(%rsi), MSG + movdqa 0x10(%rsi), T1 + movdqa MSG, STATE4 + movdqa T1, STATE5 + + /* load IV: */ + movdqu 0x00(%rdx), T2 + movdqu 0x10(%rdx), T3 + pxor MSG, T2 + pxor T1, T3 + movdqa T2, STATE0 + movdqa T3, STATE1 + + /* load the constants: */ + movdqa .Laegis256_const_0, STATE3 + movdqa .Laegis256_const_1, STATE2 + pxor STATE3, STATE4 + pxor STATE2, STATE5 + + /* update 10 times with IV and KEY: */ + update0 MSG + update1 T1 + update2 T2 + update3 T3 + update4 MSG + update5 T1 + update0 T2 + update1 T3 + update2 MSG + update3 T1 + update4 T2 + update5 T3 + update0 MSG + update1 T1 + update2 T2 + update3 T3 + + state_store3 + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_init) + +.macro ad_block a i + movdq\a (\i * 0x10)(SRC), MSG + update\i MSG + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_\i +.endm + +/* + * void crypto_aegis256_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis256_aesni_ad) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lad_out + + state_load + + mov SRC, %r8 + and $0xf, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + ad_block a 0 + ad_block a 1 + ad_block a 2 + ad_block a 3 + ad_block a 4 + ad_block a 5 + + add $0x60, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + ad_block u 0 + ad_block u 1 + ad_block u 2 + ad_block u 3 + ad_block u 4 + ad_block u 5 + + add $0x60, SRC + jmp .Lad_u_loop + +.Lad_out_0: + state_store0 + FRAME_END + ret + +.Lad_out_1: + state_store1 + FRAME_END + ret + +.Lad_out_2: + state_store2 + FRAME_END + ret + +.Lad_out_3: + state_store3 + FRAME_END + ret + +.Lad_out_4: + state_store4 + FRAME_END + ret + +.Lad_out_5: + state_store5 + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_ad) + +.macro crypt m s0 s1 s2 s3 s4 s5 + pxor \s1, \m + pxor \s4, \m + pxor \s5, \m + movdqa \s2, T3 + pand \s3, T3 + pxor T3, \m +.endm + +.macro crypt0 m + crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt1 m + crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt2 m + crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt3 m + crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro crypt4 m + crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro crypt5 m + crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +.macro encrypt_block a i + movdq\a (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + crypt\i T0 + movdq\a T0, (\i * 0x10)(DST) + + update\i MSG + + sub $0x10, LEN + cmp $0x10, LEN + jl .Lenc_out_\i +.endm + +.macro decrypt_block a i + movdq\a (\i * 0x10)(SRC), MSG + crypt\i MSG + movdq\a MSG, (\i * 0x10)(DST) + + update\i MSG + + sub $0x10, LEN + cmp $0x10, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis256_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lenc_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xf, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a 0 + encrypt_block a 1 + encrypt_block a 2 + encrypt_block a 3 + encrypt_block a 4 + encrypt_block a 5 + + add $0x60, SRC + add $0x60, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u 0 + encrypt_block u 1 + encrypt_block u 2 + encrypt_block u 3 + encrypt_block u 4 + encrypt_block u 5 + + add $0x60, SRC + add $0x60, DST + jmp .Lenc_u_loop + +.Lenc_out_0: + state_store0 + FRAME_END + ret + +.Lenc_out_1: + state_store1 + FRAME_END + ret + +.Lenc_out_2: + state_store2 + FRAME_END + ret + +.Lenc_out_3: + state_store3 + FRAME_END + ret + +.Lenc_out_4: + state_store4 + FRAME_END + ret + +.Lenc_out_5: + state_store5 + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_enc) + +/* + * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc_tail) + FRAME_BEGIN + + state_load + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + crypt0 T0 + + call __store_partial + + update0 MSG + + state_store0 + + FRAME_END +ENDPROC(crypto_aegis256_aesni_enc_tail) + +/* + * void crypto_aegis256_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Ldec_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a 0 + decrypt_block a 1 + decrypt_block a 2 + decrypt_block a 3 + decrypt_block a 4 + decrypt_block a 5 + + add $0x60, SRC + add $0x60, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u 0 + decrypt_block u 1 + decrypt_block u 2 + decrypt_block u 3 + decrypt_block u 4 + decrypt_block u 5 + + add $0x60, SRC + add $0x60, DST + jmp .Ldec_u_loop + +.Ldec_out_0: + state_store0 + FRAME_END + ret + +.Ldec_out_1: + state_store1 + FRAME_END + ret + +.Ldec_out_2: + state_store2 + FRAME_END + ret + +.Ldec_out_3: + state_store3 + FRAME_END + ret + +.Ldec_out_4: + state_store4 + FRAME_END + ret + +.Ldec_out_5: + state_store5 + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_dec) + +/* + * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec_tail) + FRAME_BEGIN + + state_load + + /* decrypt message: */ + call __load_partial + + crypt0 MSG + + movdqa MSG, T0 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Laegis256_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + update0 MSG + + state_store0 + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_dec_tail) + +/* + * void crypto_aegis256_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis256_aesni_final) + FRAME_BEGIN + + state_load + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG + + /* update state: */ + update0 MSG + update1 MSG + update2 MSG + update3 MSG + update4 MSG + update5 MSG + update0 MSG + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pxor STATE1, MSG + pxor STATE2, MSG + pxor STATE3, MSG + pxor STATE4, MSG + pxor STATE5, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_final) diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c new file mode 100644 index 000000000000..3181655dd862 --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-256 Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define AEGIS256_BLOCK_ALIGN 16 +#define AEGIS256_BLOCK_SIZE 16 +#define AEGIS256_NONCE_SIZE 32 +#define AEGIS256_STATE_BLOCKS 6 +#define AEGIS256_KEY_SIZE 32 +#define AEGIS256_MIN_AUTH_SIZE 8 +#define AEGIS256_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis256_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis256_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS256_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis256_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS256_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS256_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis256_aesni_ad(state, + AEGIS256_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis256_aesni_ad(state, left, src); + + src += left & ~(AEGIS256_BLOCK_SIZE - 1); + left &= AEGIS256_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos); + crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis256_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS256_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead); + + if (keylen != AEGIS256_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, AEGIS256_KEY_SIZE); + + return 0; +} + +static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS256_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS256_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis256_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis256_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis256_aesni_process_crypt(&state, req, ops); + crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis256_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis256_aesni_enc, + .crypt_tail = crypto_aegis256_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis256_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis256_aesni_dec, + .crypt_tail = crypto_aegis256_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis256_aesni_alg[] = { + { + .setkey = crypto_aegis256_aesni_setkey, + .setauthsize = crypto_aegis256_aesni_setauthsize, + .encrypt = crypto_aegis256_aesni_encrypt, + .decrypt = crypto_aegis256_aesni_decrypt, + .init = crypto_aegis256_aesni_init_tfm, + .exit = crypto_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis256", + .cra_driver_name = "__aegis256-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis256_aesni_setkey, + .setauthsize = cryptd_aegis256_aesni_setauthsize, + .encrypt = cryptd_aegis256_aesni_encrypt, + .decrypt = cryptd_aegis256_aesni_decrypt, + .init = cryptd_aegis256_aesni_init_tfm, + .exit = cryptd_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis256", + .cra_driver_name = "aegis256-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis256_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis256_aesni_alg, + ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +static void __exit crypto_aegis256_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis256_aesni_alg, + ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +module_init(crypto_aegis256_aesni_module_init); +module_exit(crypto_aegis256_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis256"); +MODULE_ALIAS_CRYPTO("aegis256-aesni"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 48856238a490..d8d123ea47c6 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -310,6 +310,30 @@ config CRYPTO_AEGIS256 help Support for the AEGIS-256 dedicated AEAD algorithm. +config CRYPTO_AEGIS128_AESNI_SSE2 + tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_CRYPTD + help + AESNI+SSE2 implementation of the AEGSI-128 dedicated AEAD algorithm. + +config CRYPTO_AEGIS128L_AESNI_SSE2 + tristate "AEGIS-128L AEAD algorithm (x86_64 AESNI+SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_CRYPTD + help + AESNI+SSE2 implementation of the AEGSI-128L dedicated AEAD algorithm. + +config CRYPTO_AEGIS256_AESNI_SSE2 + tristate "AEGIS-256 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_CRYPTD + help + AESNI+SSE2 implementation of the AEGSI-256 dedicated AEAD algorithm. + config CRYPTO_SEQIV tristate "Sequence Number IV Generator" select CRYPTO_AEAD -- cgit v1.2.3 From 6ecc9d9ff91ff26769e58164b6216c6189cb8302 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 11 May 2018 14:19:12 +0200 Subject: crypto: x86 - Add optimized MORUS implementations This patch adds optimized implementations of MORUS-640 and MORUS-1280, utilizing the SSE2 and AVX2 x86 extensions. For MORUS-1280 (which operates on 256-bit blocks) we provide both AVX2 and SSE2 implementation. Although SSE2 MORUS-1280 is slower than AVX2 MORUS-1280, it is comparable in speed to the SSE2 MORUS-640. Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 10 + arch/x86/crypto/morus1280-avx2-asm.S | 621 +++++++++++++++++++++++ arch/x86/crypto/morus1280-avx2-glue.c | 68 +++ arch/x86/crypto/morus1280-sse2-asm.S | 895 ++++++++++++++++++++++++++++++++++ arch/x86/crypto/morus1280-sse2-glue.c | 68 +++ arch/x86/crypto/morus640-sse2-asm.S | 614 +++++++++++++++++++++++ arch/x86/crypto/morus640-sse2-glue.c | 68 +++ crypto/Kconfig | 26 + 8 files changed, 2370 insertions(+) create mode 100644 arch/x86/crypto/morus1280-avx2-asm.S create mode 100644 arch/x86/crypto/morus1280-avx2-glue.c create mode 100644 arch/x86/crypto/morus1280-sse2-asm.S create mode 100644 arch/x86/crypto/morus1280-sse2-glue.c create mode 100644 arch/x86/crypto/morus640-sse2-asm.S create mode 100644 arch/x86/crypto/morus640-sse2-glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index c183553a4bd6..3813e7cdaada 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -42,6 +42,9 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o +obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o +obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o + # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -59,6 +62,8 @@ ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/ obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ + + obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o endif aes-i586-y := aes-i586-asm_32.o aes_glue.o @@ -80,6 +85,9 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o +morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o +morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o @@ -95,6 +103,8 @@ ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o + + morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S new file mode 100644 index 000000000000..37d422e77931 --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -0,0 +1,621 @@ +/* + * AVX2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0 %ymm0 +#define STATE0_LOW %xmm0 +#define STATE1 %ymm1 +#define STATE2 %ymm2 +#define STATE3 %ymm3 +#define STATE4 %ymm4 +#define KEY %ymm5 +#define MSG %ymm5 +#define MSG_LOW %xmm5 +#define T0 %ymm6 +#define T0_LOW %xmm6 +#define T1 %ymm7 + +.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 +.align 32 +.Lmorus1280_const: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 +.align 32 +.Lmorus1280_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro morus1280_round s0, s1, s2, s3, s4, b, w + vpand \s1, \s2, T0 + vpxor T0, \s0, \s0 + vpxor \s3, \s0, \s0 + vpsllq $\b, \s0, T0 + vpsrlq $(64 - \b), \s0, \s0 + vpxor T0, \s0, \s0 + vpermq $\w, \s3, \s3 +.endm + +/* + * __morus1280_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update: + morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 + vpxor MSG, STATE1, STATE1 + morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 + vpxor MSG, STATE2, STATE2 + morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 + vpxor MSG, STATE3, STATE3 + morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 + vpxor MSG, STATE4, STATE4 + morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 + ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update_zero: + morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 + morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 + morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 + morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 + morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 + ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + vpxor MSG, MSG, MSG + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG_LOW + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pshufd $MASK2, MSG_LOW, MSG_LOW + pinsrq $0, (%r8), MSG_LOW + +.Lld_partial_8: + mov %rcx, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + vpermq $MASK2, MSG, MSG + movdqu (%rsi), MSG_LOW + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0_LOW, (%r9) + vpermq $MASK2, T0, T0 + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0_LOW, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + pextrq $1, T0_LOW, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_avx2_init(void *state, const void *key, + * const void *iv); + */ +ENTRY(crypto_morus1280_avx2_init) + FRAME_BEGIN + + /* load IV: */ + vpxor STATE0, STATE0, STATE0 + movdqu (%rdx), STATE0_LOW + /* load key: */ + vmovdqu (%rsi), KEY + vmovdqa KEY, STATE1 + /* load all ones: */ + vpcmpeqd STATE2, STATE2, STATE2 + /* load all zeros: */ + vpxor STATE3, STATE3, STATE3 + /* load the constant: */ + vmovdqa .Lmorus1280_const, STATE4 + + /* update 16 times with zero: */ + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + + /* xor-in the key again after updates: */ + vpxor KEY, STATE1, STATE1 + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_init) + +/* + * void crypto_morus1280_avx2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_ad) + FRAME_BEGIN + + cmp $32, %rdx + jb .Lad_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + and $0x1F, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + vmovdqa (%rsi), MSG + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + vmovdqu (%rsi), MSG + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_ad) + +/* + * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_enc) + FRAME_BEGIN + + cmp $32, %rcx + jb .Lenc_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0x1F, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + vmovdqa (%rsi), MSG + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + vmovdqa T0, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + vmovdqu (%rsi), MSG + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + vmovdqu T0, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_enc) + +/* + * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* encrypt message: */ + call __load_partial + + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + + call __store_partial + + call __morus1280_update + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END +ENDPROC(crypto_morus1280_avx2_enc_tail) + +/* + * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec) + FRAME_BEGIN + + cmp $32, %rcx + jb .Ldec_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0x1F, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + vmovdqa (%rsi), MSG + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqa MSG, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + vmovdqu (%rsi), MSG + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqu MSG, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_dec) + +/* + * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* decrypt message: */ + call __load_partial + + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqa MSG, T0 + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0_LOW + vpbroadcastb T0_LOW, T0 + vmovdqa .Lmorus1280_counter, T1 + vpcmpgtb T1, T0, T0 + vpand T0, MSG, MSG + + call __morus1280_update + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_dec_tail) + +/* + * void crypto_morus1280_avx2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_avx2_final) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* xor state[0] into state[4]: */ + vpxor STATE0, STATE4, STATE4 + + /* prepare length block: */ + vpxor MSG, MSG, MSG + vpinsrq $0, %rdx, MSG_LOW, MSG_LOW + vpinsrq $1, %rcx, MSG_LOW, MSG_LOW + vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ + + /* update state: */ + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + + /* xor tag: */ + vmovdqu (%rsi), MSG + + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c new file mode 100644 index 000000000000..f111f36d26dc --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Glue for AVX2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include + +asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); + +static const struct x86_cpu_id avx2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AVX2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); + +static int __init crypto_morus1280_avx2_module_init(void) +{ + if (!x86_match_cpu(avx2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus1280_avx2_algs, + ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +static void __exit crypto_morus1280_avx2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus1280_avx2_algs, + ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +module_init(crypto_morus1280_avx2_module_init); +module_exit(crypto_morus1280_avx2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S new file mode 100644 index 000000000000..1fe637c7be9d --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -0,0 +1,895 @@ +/* + * SSE2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) + +#define STATE0_LO %xmm0 +#define STATE0_HI %xmm1 +#define STATE1_LO %xmm2 +#define STATE1_HI %xmm3 +#define STATE2_LO %xmm4 +#define STATE2_HI %xmm5 +#define STATE3_LO %xmm6 +#define STATE3_HI %xmm7 +#define STATE4_LO %xmm8 +#define STATE4_HI %xmm9 +#define KEY_LO %xmm10 +#define KEY_HI %xmm11 +#define MSG_LO %xmm10 +#define MSG_HI %xmm11 +#define T0_LO %xmm12 +#define T0_HI %xmm13 +#define T1_LO %xmm14 +#define T1_HI %xmm15 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 16 +.align 16 +.Lmorus640_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter_0: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Lmorus640_counter_1: + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro rol1 hi, lo + /* + * HI_1 | HI_0 || LO_1 | LO_0 + * ==> + * HI_0 | HI_1 || LO_1 | LO_0 + * ==> + * HI_0 | LO_1 || LO_0 | HI_1 + */ + pshufd $MASK2, \hi, \hi + movdqa \hi, T0_LO + punpcklqdq \lo, T0_LO + punpckhqdq \hi, \lo + movdqa \lo, \hi + movdqa T0_LO, \lo +.endm + +.macro rol2 hi, lo + movdqa \lo, T0_LO + movdqa \hi, \lo + movdqa T0_LO, \hi +.endm + +.macro rol3 hi, lo + /* + * HI_1 | HI_0 || LO_1 | LO_0 + * ==> + * HI_0 | HI_1 || LO_1 | LO_0 + * ==> + * LO_0 | HI_1 || HI_0 | LO_1 + */ + pshufd $MASK2, \hi, \hi + movdqa \lo, T0_LO + punpckhqdq \hi, T0_LO + punpcklqdq \lo, \hi + movdqa T0_LO, \lo +.endm + +.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w + movdqa \s1_l, T0_LO + pand \s2_l, T0_LO + pxor T0_LO, \s0_l + + movdqa \s1_h, T0_LO + pand \s2_h, T0_LO + pxor T0_LO, \s0_h + + pxor \s3_l, \s0_l + pxor \s3_h, \s0_h + + movdqa \s0_l, T0_LO + psllq $\b, T0_LO + psrlq $(64 - \b), \s0_l + pxor T0_LO, \s0_l + + movdqa \s0_h, T0_LO + psllq $\b, T0_LO + psrlq $(64 - \b), \s0_h + pxor T0_LO, \s0_h + + \w \s3_h, \s3_l +.endm + +/* + * __morus1280_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update: + morus1280_round \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + 13, rol1 + pxor MSG_LO, STATE1_LO + pxor MSG_HI, STATE1_HI + morus1280_round \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + 46, rol2 + pxor MSG_LO, STATE2_LO + pxor MSG_HI, STATE2_HI + morus1280_round \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + 38, rol3 + pxor MSG_LO, STATE3_LO + pxor MSG_HI, STATE3_HI + morus1280_round \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + 7, rol2 + pxor MSG_LO, STATE4_LO + pxor MSG_HI, STATE4_HI + morus1280_round \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + 4, rol1 + ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update_zero: + morus1280_round \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + 13, rol1 + morus1280_round \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + 46, rol2 + morus1280_round \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + 38, rol3 + morus1280_round \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + 7, rol2 + morus1280_round \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + 4, rol1 + ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG_LO, MSG_LO + pxor MSG_HI, MSG_HI + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG_LO + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pslldq $8, MSG_LO + movq (%r8), T0_LO + pxor T0_LO, MSG_LO + +.Lld_partial_8: + mov %rcx, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + movdqa MSG_LO, MSG_HI + movdqu (%rsi), MSG_LO + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0_LO, (%r9) + movdqa T0_HI, T0_LO + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0_LO, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0_LO + movq T0_LO, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_sse2_init(void *state, const void *key, + * const void *iv); + */ +ENTRY(crypto_morus1280_sse2_init) + FRAME_BEGIN + + /* load IV: */ + pxor STATE0_HI, STATE0_HI + movdqu (%rdx), STATE0_LO + /* load key: */ + movdqu 0(%rsi), KEY_LO + movdqu 16(%rsi), KEY_HI + movdqa KEY_LO, STATE1_LO + movdqa KEY_HI, STATE1_HI + /* load all ones: */ + pcmpeqd STATE2_LO, STATE2_LO + pcmpeqd STATE2_HI, STATE2_HI + /* load all zeros: */ + pxor STATE3_LO, STATE3_LO + pxor STATE3_HI, STATE3_HI + /* load the constant: */ + movdqa .Lmorus640_const_0, STATE4_LO + movdqa .Lmorus640_const_1, STATE4_HI + + /* update 16 times with zero: */ + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + + /* xor-in the key again after updates: */ + pxor KEY_LO, STATE1_LO + pxor KEY_HI, STATE1_HI + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_init) + +/* + * void crypto_morus1280_sse2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_ad) + FRAME_BEGIN + + cmp $32, %rdx + jb .Lad_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_ad) + +/* + * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc) + FRAME_BEGIN + + cmp $32, %rcx + jb .Lenc_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + movdqa T0_LO, 0(%rdx) + movdqa T0_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + movdqu T0_LO, 0(%rdx) + movdqu T0_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_enc) + +/* + * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* encrypt message: */ + call __load_partial + + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + + call __store_partial + + call __morus1280_update + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END +ENDPROC(crypto_morus1280_sse2_enc_tail) + +/* + * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec) + FRAME_BEGIN + + cmp $32, %rcx + jb .Ldec_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa MSG_LO, 0(%rdx) + movdqa MSG_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqu MSG_LO, 0(%rdx) + movdqu MSG_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_dec) + +/* + * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* decrypt message: */ + call __load_partial + + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + movdqa T0_LO, T0_HI + movdqa .Lmorus640_counter_0, T1_LO + movdqa .Lmorus640_counter_1, T1_HI + pcmpgtb T1_LO, T0_LO + pcmpgtb T1_HI, T0_HI + pand T0_LO, MSG_LO + pand T0_HI, MSG_HI + + call __morus1280_update + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_dec_tail) + +/* + * void crypto_morus1280_sse2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_sse2_final) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* xor state[0] into state[4]: */ + pxor STATE0_LO, STATE4_LO + pxor STATE0_HI, STATE4_HI + + /* prepare length block: */ + movq %rdx, MSG_LO + movq %rcx, T0_LO + pslldq $8, T0_LO + pxor T0_LO, MSG_LO + psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ + pxor MSG_HI, MSG_HI + + /* update state: */ + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + + /* xor tag: */ + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T0_LO + movdqa STATE1_HI, T0_HI + rol3 T0_HI, T0_LO + pxor T0_LO, MSG_LO + pxor T0_HI, MSG_HI + movdqa STATE2_LO, T0_LO + movdqa STATE2_HI, T0_HI + pand STATE3_LO, T0_LO + pand STATE3_HI, T0_HI + pxor T0_LO, MSG_LO + pxor T0_HI, MSG_HI + + movdqu MSG_LO, 0(%rsi) + movdqu MSG_HI, 16(%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c new file mode 100644 index 000000000000..839270aa713c --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include + +asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); + +static const struct x86_cpu_id sse2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus1280_sse2_module_init(void) +{ + if (!x86_match_cpu(sse2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus1280_sse2_algs, + ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +static void __exit crypto_morus1280_sse2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus1280_sse2_algs, + ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +module_init(crypto_morus1280_sse2_module_init); +module_exit(crypto_morus1280_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S new file mode 100644 index 000000000000..71c72a0a0862 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -0,0 +1,614 @@ +/* + * SSE2 implementation of MORUS-640 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define KEY %xmm5 +#define MSG %xmm5 +#define T0 %xmm6 +#define T1 %xmm7 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 32 +.align 16 +.Lmorus640_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +.macro morus640_round s0, s1, s2, s3, s4, b, w + movdqa \s1, T0 + pand \s2, T0 + pxor T0, \s0 + pxor \s3, \s0 + movdqa \s0, T0 + pslld $\b, T0 + psrld $(32 - \b), \s0 + pxor T0, \s0 + pshufd $\w, \s3, \s3 +.endm + +/* + * __morus640_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus640_update: + morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 + pxor MSG, STATE1 + morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 + pxor MSG, STATE2 + morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 + pxor MSG, STATE3 + morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 + pxor MSG, STATE4 + morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 + ret +ENDPROC(__morus640_update) + + +/* + * __morus640_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus640_update_zero: + morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 + morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 + morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 + morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 + morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 + ret +ENDPROC(__morus640_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_morus640_sse2_init) + FRAME_BEGIN + + /* load IV: */ + movdqu (%rdx), STATE0 + /* load key: */ + movdqu (%rsi), KEY + movdqa KEY, STATE1 + /* load all ones: */ + pcmpeqd STATE2, STATE2 + /* load the constants: */ + movdqa .Lmorus640_const_0, STATE3 + movdqa .Lmorus640_const_1, STATE4 + + /* update 16 times with zero: */ + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + + /* xor-in the key again after updates: */ + pxor KEY, STATE1 + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_init) + +/* + * void crypto_morus640_sse2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_ad) + FRAME_BEGIN + + cmp $16, %rdx + jb .Lad_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + movdqa (%rsi), MSG + call __morus640_update + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + movdqu (%rsi), MSG + call __morus640_update + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_ad) + +/* + * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc) + FRAME_BEGIN + + cmp $16, %rcx + jb .Lenc_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + movdqa (%rsi), MSG + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + movdqa T0, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + movdqu (%rsi), MSG + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + movdqu T0, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_enc) + +/* + * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + + call __store_partial + + call __morus640_update + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END +ENDPROC(crypto_morus640_sse2_enc_tail) + +/* + * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec) + FRAME_BEGIN + + cmp $16, %rcx + jb .Ldec_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + movdqa (%rsi), MSG + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqa MSG, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + movdqu (%rsi), MSG + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqu MSG, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_dec) + +/* + * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* decrypt message: */ + call __load_partial + + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqa MSG, T0 + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Lmorus640_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + call __morus640_update + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_dec_tail) + +/* + * void crypto_morus640_sse2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus640_sse2_final) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* xor state[0] into state[4]: */ + pxor STATE0, STATE4 + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + /* update state: */ + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c new file mode 100644 index 000000000000..26b47e2db8d2 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + * Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include + +asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); + +static const struct x86_cpu_id sse2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus640_sse2_module_init(void) +{ + if (!x86_match_cpu(sse2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus640_sse2_algs, + ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +static void __exit crypto_morus640_sse2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus640_sse2_algs, + ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +module_init(crypto_morus640_sse2_module_init); +module_exit(crypto_morus640_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus640"); +MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 4761667fbcf9..75f5efde9aa3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -348,6 +348,14 @@ config CRYPTO_MORUS640_GLUE Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD algorithm. +config CRYPTO_MORUS640_SSE2 + tristate "MORUS-640 AEAD algorithm (x86_64 SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_MORUS640_GLUE + help + SSE2 implementation of the MORUS-640 dedicated AEAD algorithm. + config CRYPTO_MORUS1280 tristate "MORUS-1280 AEAD algorithm" select CRYPTO_AEAD @@ -362,6 +370,24 @@ config CRYPTO_MORUS1280_GLUE Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD algorithm. +config CRYPTO_MORUS1280_SSE2 + tristate "MORUS-1280 AEAD algorithm (x86_64 SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_MORUS1280_GLUE + help + SSE2 optimizedimplementation of the MORUS-1280 dedicated AEAD + algorithm. + +config CRYPTO_MORUS1280_AVX2 + tristate "MORUS-1280 AEAD algorithm (x86_64 AVX2 implementation)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_MORUS1280_GLUE + help + AVX2 optimized implementation of the MORUS-1280 dedicated AEAD + algorithm. + config CRYPTO_SEQIV tristate "Sequence Number IV Generator" select CRYPTO_AEAD -- cgit v1.2.3 From dd09f58ce0c7011f1c7d0a52779dafc7a5ba5506 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Sun, 20 May 2018 10:57:23 +0200 Subject: crypto: x86/aegis256 - Fix wrong key buffer size AEGIS-256 key is two blocks, not one. Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations") Reported-by: Eric Biggers Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis256-aesni-glue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 3181655dd862..2b5dd3af8f4d 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -57,7 +57,7 @@ struct aegis_state { }; struct aegis_ctx { - struct aegis_block key; + struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE]; }; struct aegis_crypt_ops { @@ -164,7 +164,7 @@ static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, return -EINVAL; } - memcpy(ctx->key.bytes, key, AEGIS256_KEY_SIZE); + memcpy(ctx->key, key, AEGIS256_KEY_SIZE); return 0; } @@ -190,7 +190,7 @@ static void crypto_aegis256_aesni_crypt(struct aead_request *req, kernel_fpu_begin(); - crypto_aegis256_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis256_aesni_init(&state, ctx->key, req->iv); crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); crypto_aegis256_aesni_process_crypt(&state, req, ops); crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); -- cgit v1.2.3 From 2808f17319155256498badd5acd9609aaa3f13b6 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 21 May 2018 21:41:51 +0200 Subject: crypto: morus - Mark MORUS SIMD glue as x86-specific Commit 56e8e57fc3a7 ("crypto: morus - Add common SIMD glue code for MORUS") accidetally consiedered the glue code to be usable by different architectures, but it seems to be only usable on x86. This patch moves it under arch/x86/crypto and adds 'depends on X86' to the Kconfig options and also removes the prompt to hide these internal options from the user. Reported-by: kbuild test robot Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/morus1280_glue.c | 302 +++++++++++++++++++++++++++++++++++++++ arch/x86/crypto/morus640_glue.c | 298 ++++++++++++++++++++++++++++++++++++++ crypto/Kconfig | 6 +- crypto/Makefile | 2 - crypto/morus1280_glue.c | 302 --------------------------------------- crypto/morus640_glue.c | 298 -------------------------------------- 7 files changed, 607 insertions(+), 604 deletions(-) create mode 100644 arch/x86/crypto/morus1280_glue.c create mode 100644 arch/x86/crypto/morus640_glue.c delete mode 100644 crypto/morus1280_glue.c delete mode 100644 crypto/morus640_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3813e7cdaada..48e731d782e9 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -42,6 +42,9 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o +obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o +obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o + obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c new file mode 100644 index 000000000000..0dccdda1eb3a --- /dev/null +++ b/arch/x86/crypto/morus1280_glue.c @@ -0,0 +1,302 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct morus1280_state { + struct morus1280_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus1280_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, const void *src, void *dst, + unsigned int length); + void (*crypt_tail)(void *state, const void *src, void *dst, + unsigned int length); +}; + +static void crypto_morus1280_glue_process_ad( + struct morus1280_state *state, + const struct morus1280_glue_ops *ops, + struct scatterlist *sg_src, unsigned int assoclen) +{ + struct scatter_walk walk; + struct morus1280_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= MORUS1280_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = MORUS1280_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + + ops->ad(state, src, left); + src += left & ~(MORUS1280_BLOCK_SIZE - 1); + left &= MORUS1280_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + + pos += left; + assoclen -= size; + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); + ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); + } +} + +static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, + struct morus1280_ops ops, + struct aead_request *req) +{ + struct skcipher_walk walk; + u8 *cursor_src, *cursor_dst; + unsigned int chunksize, base; + + ops.skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + cursor_src = walk.src.virt.addr; + cursor_dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + + base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1); + cursor_src += base; + cursor_dst += base; + chunksize &= MORUS1280_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops.crypt_tail(state, cursor_src, cursor_dst, + chunksize); + + skcipher_walk_done(&walk, 0); + } +} + +int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct morus1280_ctx *ctx = crypto_aead_ctx(aead); + + if (keylen == MORUS1280_BLOCK_SIZE) { + memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); + } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { + memcpy(ctx->key.bytes, key, keylen); + memcpy(ctx->key.bytes + keylen, key, keylen); + } else { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); + +int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); + +static void crypto_morus1280_glue_crypt(struct aead_request *req, + struct morus1280_ops ops, + unsigned int cryptlen, + struct morus1280_block *tag_xor) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_state state; + + kernel_fpu_begin(); + + ctx->ops->init(&state, &ctx->key, req->iv); + crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); + crypto_morus1280_glue_process_crypt(&state, ops, req); + ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +int crypto_morus1280_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = ctx->ops->enc, + .crypt_tail = ctx->ops->enc_tail, + }; + + struct morus1280_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); + +int crypto_morus1280_glue_decrypt(struct aead_request *req) +{ + static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = ctx->ops->dec, + .crypt_tail = ctx->ops->dec_tail, + }; + + struct morus1280_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + + return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); + +void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, + const struct morus1280_glue_ops *ops) +{ + struct morus1280_ctx *ctx = crypto_aead_ctx(aead); + ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); + +int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); + +int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); + +int cryptd_morus1280_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); + +int cryptd_morus1280_glue_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); + +int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + const char *name = crypto_aead_alg(aead)->base.cra_driver_name; + char internal_name[CRYPTO_MAX_ALG_NAME]; + + if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) + >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); + +void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c new file mode 100644 index 000000000000..7b58fe4d9bd1 --- /dev/null +++ b/arch/x86/crypto/morus640_glue.c @@ -0,0 +1,298 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + * Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct morus640_state { + struct morus640_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus640_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, const void *src, void *dst, + unsigned int length); + void (*crypt_tail)(void *state, const void *src, void *dst, + unsigned int length); +}; + +static void crypto_morus640_glue_process_ad( + struct morus640_state *state, + const struct morus640_glue_ops *ops, + struct scatterlist *sg_src, unsigned int assoclen) +{ + struct scatter_walk walk; + struct morus640_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= MORUS640_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = MORUS640_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + + ops->ad(state, src, left); + src += left & ~(MORUS640_BLOCK_SIZE - 1); + left &= MORUS640_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + + pos += left; + assoclen -= size; + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); + ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); + } +} + +static void crypto_morus640_glue_process_crypt(struct morus640_state *state, + struct morus640_ops ops, + struct aead_request *req) +{ + struct skcipher_walk walk; + u8 *cursor_src, *cursor_dst; + unsigned int chunksize, base; + + ops.skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + cursor_src = walk.src.virt.addr; + cursor_dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + + base = chunksize & ~(MORUS640_BLOCK_SIZE - 1); + cursor_src += base; + cursor_dst += base; + chunksize &= MORUS640_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops.crypt_tail(state, cursor_src, cursor_dst, + chunksize); + + skcipher_walk_done(&walk, 0); + } +} + +int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct morus640_ctx *ctx = crypto_aead_ctx(aead); + + if (keylen != MORUS640_BLOCK_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); + +int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); + +static void crypto_morus640_glue_crypt(struct aead_request *req, + struct morus640_ops ops, + unsigned int cryptlen, + struct morus640_block *tag_xor) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_state state; + + kernel_fpu_begin(); + + ctx->ops->init(&state, &ctx->key, req->iv); + crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); + crypto_morus640_glue_process_crypt(&state, ops, req); + ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +int crypto_morus640_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = ctx->ops->enc, + .crypt_tail = ctx->ops->enc_tail, + }; + + struct morus640_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); + +int crypto_morus640_glue_decrypt(struct aead_request *req) +{ + static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = ctx->ops->dec, + .crypt_tail = ctx->ops->dec_tail, + }; + + struct morus640_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + + return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); + +void crypto_morus640_glue_init_ops(struct crypto_aead *aead, + const struct morus640_glue_ops *ops) +{ + struct morus640_ctx *ctx = crypto_aead_ctx(aead); + ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); + +int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); + +int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); + +int cryptd_morus640_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); + +int cryptd_morus640_glue_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); + +int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + const char *name = crypto_aead_alg(aead)->base.cra_driver_name; + char internal_name[CRYPTO_MAX_ALG_NAME]; + + if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) + >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); + +void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek "); +MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 75f5efde9aa3..30d54a56e64a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -341,7 +341,8 @@ config CRYPTO_MORUS640 Support for the MORUS-640 dedicated AEAD algorithm. config CRYPTO_MORUS640_GLUE - tristate "MORUS-640 AEAD algorithm (glue for SIMD optimizations)" + tristate + depends on X86 select CRYPTO_AEAD select CRYPTO_CRYPTD help @@ -363,7 +364,8 @@ config CRYPTO_MORUS1280 Support for the MORUS-1280 dedicated AEAD algorithm. config CRYPTO_MORUS1280_GLUE - tristate "MORUS-1280 AEAD algorithm (glue for SIMD optimizations)" + tristate + depends on X86 select CRYPTO_AEAD select CRYPTO_CRYPTD help diff --git a/crypto/Makefile b/crypto/Makefile index 68a7c546460a..6d1d40eeb964 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -91,8 +91,6 @@ obj-$(CONFIG_CRYPTO_AEGIS128L) += aegis128l.o obj-$(CONFIG_CRYPTO_AEGIS256) += aegis256.o obj-$(CONFIG_CRYPTO_MORUS640) += morus640.o obj-$(CONFIG_CRYPTO_MORUS1280) += morus1280.o -obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o -obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o obj-$(CONFIG_CRYPTO_MCRYPTD) += mcryptd.o diff --git a/crypto/morus1280_glue.c b/crypto/morus1280_glue.c deleted file mode 100644 index ce1e5c34b09d..000000000000 --- a/crypto/morus1280_glue.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Common glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct morus1280_state { - struct morus1280_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus1280_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus1280_glue_process_ad( - struct morus1280_state *state, - const struct morus1280_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus1280_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS1280_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS1280_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS1280_BLOCK_SIZE - 1); - left &= MORUS1280_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - } -} - -static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, - struct morus1280_ops ops, - struct aead_request *req) -{ - struct skcipher_walk walk; - u8 *cursor_src, *cursor_dst; - unsigned int chunksize, base; - - ops.skcipher_walk_init(&walk, req, false); - - while (walk.nbytes) { - cursor_src = walk.src.virt.addr; - cursor_dst = walk.dst.virt.addr; - chunksize = walk.nbytes; - - ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); - - base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1); - cursor_src += base; - cursor_dst += base; - chunksize &= MORUS1280_BLOCK_SIZE - 1; - - if (chunksize > 0) - ops.crypt_tail(state, cursor_src, cursor_dst, - chunksize); - - skcipher_walk_done(&walk, 0); - } -} - -int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen == MORUS1280_BLOCK_SIZE) { - memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); - } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { - memcpy(ctx->key.bytes, key, keylen); - memcpy(ctx->key.bytes + keylen, key, keylen); - } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); - -int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); - -static void crypto_morus1280_glue_crypt(struct aead_request *req, - struct morus1280_ops ops, - unsigned int cryptlen, - struct morus1280_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_state state; - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus1280_glue_process_crypt(&state, ops, req); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus1280_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); - -int crypto_morus1280_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus1280_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); - -void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, - const struct morus1280_glue_ops *ops) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); - -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); - -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); - -int cryptd_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); - -int cryptd_morus1280_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); - -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); - -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for optimizations"); diff --git a/crypto/morus640_glue.c b/crypto/morus640_glue.c deleted file mode 100644 index c7e788cfaa29..000000000000 --- a/crypto/morus640_glue.c +++ /dev/null @@ -1,298 +0,0 @@ -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Common glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct morus640_state { - struct morus640_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus640_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus640_glue_process_ad( - struct morus640_state *state, - const struct morus640_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus640_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS640_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS640_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS640_BLOCK_SIZE - 1); - left &= MORUS640_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - } -} - -static void crypto_morus640_glue_process_crypt(struct morus640_state *state, - struct morus640_ops ops, - struct aead_request *req) -{ - struct skcipher_walk walk; - u8 *cursor_src, *cursor_dst; - unsigned int chunksize, base; - - ops.skcipher_walk_init(&walk, req, false); - - while (walk.nbytes) { - cursor_src = walk.src.virt.addr; - cursor_dst = walk.dst.virt.addr; - chunksize = walk.nbytes; - - ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); - - base = chunksize & ~(MORUS640_BLOCK_SIZE - 1); - cursor_src += base; - cursor_dst += base; - chunksize &= MORUS640_BLOCK_SIZE - 1; - - if (chunksize > 0) - ops.crypt_tail(state, cursor_src, cursor_dst, - chunksize); - - skcipher_walk_done(&walk, 0); - } -} - -int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen != MORUS640_BLOCK_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); - -int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); - -static void crypto_morus640_glue_crypt(struct aead_request *req, - struct morus640_ops ops, - unsigned int cryptlen, - struct morus640_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_state state; - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus640_glue_process_crypt(&state, ops, req); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus640_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); - -int crypto_morus640_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus640_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); - -void crypto_morus640_glue_init_ops(struct crypto_aead *aead, - const struct morus640_glue_ops *ops) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); - -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); - -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); - -int cryptd_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); - -int cryptd_morus640_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); - -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); - -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek "); -MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for optimizations"); -- cgit v1.2.3 From b7b73cd5d74694ed59abcdb4974dacb4ff8b2a2a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 26 May 2018 00:08:58 -0700 Subject: crypto: x86/salsa20 - remove x86 salsa20 implementations The x86 assembly implementations of Salsa20 use the frame base pointer register (%ebp or %rbp), which breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Recent (v4.10+) kernels will warn about this, e.g. WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c [...] But after looking into it, I believe there's very little reason to still retain the x86 Salsa20 code. First, these are *not* vectorized (SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere close to the best Salsa20 performance on any remotely modern x86 processor; they're just regular x86 assembly. Second, it's still unclear that anyone is actually using the kernel's Salsa20 at all, especially given that now ChaCha20 is supported too, and with much more efficient SSSE3 and AVX2 implementations. Finally, in benchmarks I did on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic (~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm is only slightly faster than salsa20-generic (~15% faster on Skylake, ~20% faster on Zen). The gcc version made little difference. So, the x86_64 salsa20-asm is pretty clearly useless. That leaves just the i686 salsa20-asm, which based on my tests provides a 15-20% speed boost. But that's without updating the code to not use %ebp. And given the maintenance cost, the small speed difference vs. salsa20-generic, the fact that few people still use i686 kernels, the doubt that anyone is even using the kernel's Salsa20 at all, and the fact that a SSE2 implementation would almost certainly be much faster on any remotely modern x86 processor yet no one has cared enough to add one yet, I don't think it's worthwhile to keep. Thus, just remove both the x86_64 and i686 salsa20-asm implementations. Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 - arch/x86/crypto/salsa20-i586-asm_32.S | 938 -------------------------------- arch/x86/crypto/salsa20-x86_64-asm_64.S | 805 --------------------------- arch/x86/crypto/salsa20_glue.c | 91 ---- crypto/Kconfig | 28 - 5 files changed, 1866 deletions(-) delete mode 100644 arch/x86/crypto/salsa20-i586-asm_32.S delete mode 100644 arch/x86/crypto/salsa20-x86_64-asm_64.S delete mode 100644 arch/x86/crypto/salsa20_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 48e731d782e9..a450ad573dcb 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o -obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o @@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o @@ -71,7 +69,6 @@ endif aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o -salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o @@ -80,7 +77,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S deleted file mode 100644 index 6014b7b9e52a..000000000000 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ /dev/null @@ -1,938 +0,0 @@ -# Derived from: -# salsa20_pm.s version 20051229 -# D. J. Bernstein -# Public domain. - -#include - -.text - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) - mov %esp,%eax - and $31,%eax - add $256,%eax - sub %eax,%esp - # eax_stack = eax - movl %eax,80(%esp) - # ebx_stack = ebx - movl %ebx,84(%esp) - # esi_stack = esi - movl %esi,88(%esp) - # edi_stack = edi - movl %edi,92(%esp) - # ebp_stack = ebp - movl %ebp,96(%esp) - # x = arg1 - movl 4(%esp,%eax),%edx - # m = arg2 - movl 8(%esp,%eax),%esi - # out = arg3 - movl 12(%esp,%eax),%edi - # bytes = arg4 - movl 16(%esp,%eax),%ebx - # bytes -= 0 - sub $0,%ebx - # goto done if unsigned<= - jbe ._done -._start: - # in0 = *(uint32 *) (x + 0) - movl 0(%edx),%eax - # in1 = *(uint32 *) (x + 4) - movl 4(%edx),%ecx - # in2 = *(uint32 *) (x + 8) - movl 8(%edx),%ebp - # j0 = in0 - movl %eax,164(%esp) - # in3 = *(uint32 *) (x + 12) - movl 12(%edx),%eax - # j1 = in1 - movl %ecx,168(%esp) - # in4 = *(uint32 *) (x + 16) - movl 16(%edx),%ecx - # j2 = in2 - movl %ebp,172(%esp) - # in5 = *(uint32 *) (x + 20) - movl 20(%edx),%ebp - # j3 = in3 - movl %eax,176(%esp) - # in6 = *(uint32 *) (x + 24) - movl 24(%edx),%eax - # j4 = in4 - movl %ecx,180(%esp) - # in7 = *(uint32 *) (x + 28) - movl 28(%edx),%ecx - # j5 = in5 - movl %ebp,184(%esp) - # in8 = *(uint32 *) (x + 32) - movl 32(%edx),%ebp - # j6 = in6 - movl %eax,188(%esp) - # in9 = *(uint32 *) (x + 36) - movl 36(%edx),%eax - # j7 = in7 - movl %ecx,192(%esp) - # in10 = *(uint32 *) (x + 40) - movl 40(%edx),%ecx - # j8 = in8 - movl %ebp,196(%esp) - # in11 = *(uint32 *) (x + 44) - movl 44(%edx),%ebp - # j9 = in9 - movl %eax,200(%esp) - # in12 = *(uint32 *) (x + 48) - movl 48(%edx),%eax - # j10 = in10 - movl %ecx,204(%esp) - # in13 = *(uint32 *) (x + 52) - movl 52(%edx),%ecx - # j11 = in11 - movl %ebp,208(%esp) - # in14 = *(uint32 *) (x + 56) - movl 56(%edx),%ebp - # j12 = in12 - movl %eax,212(%esp) - # in15 = *(uint32 *) (x + 60) - movl 60(%edx),%eax - # j13 = in13 - movl %ecx,216(%esp) - # j14 = in14 - movl %ebp,220(%esp) - # j15 = in15 - movl %eax,224(%esp) - # x_backup = x - movl %edx,64(%esp) -._bytesatleast1: - # bytes - 64 - cmp $64,%ebx - # goto nocopy if unsigned>= - jae ._nocopy - # ctarget = out - movl %edi,228(%esp) - # out = &tmp - leal 0(%esp),%edi - # i = bytes - mov %ebx,%ecx - # while (i) { *out++ = *m++; --i } - rep movsb - # out = &tmp - leal 0(%esp),%edi - # m = &tmp - leal 0(%esp),%esi -._nocopy: - # out_backup = out - movl %edi,72(%esp) - # m_backup = m - movl %esi,68(%esp) - # bytes_backup = bytes - movl %ebx,76(%esp) - # in0 = j0 - movl 164(%esp),%eax - # in1 = j1 - movl 168(%esp),%ecx - # in2 = j2 - movl 172(%esp),%edx - # in3 = j3 - movl 176(%esp),%ebx - # x0 = in0 - movl %eax,100(%esp) - # x1 = in1 - movl %ecx,104(%esp) - # x2 = in2 - movl %edx,108(%esp) - # x3 = in3 - movl %ebx,112(%esp) - # in4 = j4 - movl 180(%esp),%eax - # in5 = j5 - movl 184(%esp),%ecx - # in6 = j6 - movl 188(%esp),%edx - # in7 = j7 - movl 192(%esp),%ebx - # x4 = in4 - movl %eax,116(%esp) - # x5 = in5 - movl %ecx,120(%esp) - # x6 = in6 - movl %edx,124(%esp) - # x7 = in7 - movl %ebx,128(%esp) - # in8 = j8 - movl 196(%esp),%eax - # in9 = j9 - movl 200(%esp),%ecx - # in10 = j10 - movl 204(%esp),%edx - # in11 = j11 - movl 208(%esp),%ebx - # x8 = in8 - movl %eax,132(%esp) - # x9 = in9 - movl %ecx,136(%esp) - # x10 = in10 - movl %edx,140(%esp) - # x11 = in11 - movl %ebx,144(%esp) - # in12 = j12 - movl 212(%esp),%eax - # in13 = j13 - movl 216(%esp),%ecx - # in14 = j14 - movl 220(%esp),%edx - # in15 = j15 - movl 224(%esp),%ebx - # x12 = in12 - movl %eax,148(%esp) - # x13 = in13 - movl %ecx,152(%esp) - # x14 = in14 - movl %edx,156(%esp) - # x15 = in15 - movl %ebx,160(%esp) - # i = 20 - mov $20,%ebp - # p = x0 - movl 100(%esp),%eax - # s = x5 - movl 120(%esp),%ecx - # t = x10 - movl 140(%esp),%edx - # w = x15 - movl 160(%esp),%ebx -._mainloop: - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x12 - addl 148(%esp),%eax - # x5 = s - movl %ecx,120(%esp) - # t += x6 - addl 124(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x1 - movl 104(%esp),%esi - # r += s - add %ecx,%esi - # v = x11 - movl 144(%esp),%edi - # v += w - add %ebx,%edi - # p <<<= 7 - rol $7,%eax - # p ^= x4 - xorl 116(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x14 - xorl 156(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x9 - xorl 136(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x3 - xorl 112(%esp),%edi - # x4 = p - movl %eax,116(%esp) - # x14 = t - movl %edx,156(%esp) - # p += x0 - addl 100(%esp),%eax - # x9 = r - movl %esi,136(%esp) - # t += x10 - addl 140(%esp),%edx - # x3 = v - movl %edi,112(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x8 - xorl 132(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x2 - xorl 108(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x13 - xorl 152(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x7 - xorl 128(%esp),%ebx - # x8 = p - movl %eax,132(%esp) - # x2 = t - movl %edx,108(%esp) - # p += x4 - addl 116(%esp),%eax - # x13 = s - movl %ecx,152(%esp) - # t += x14 - addl 156(%esp),%edx - # x7 = w - movl %ebx,128(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x12 - xorl 148(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x6 - xorl 124(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x1 - xorl 104(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x11 - xorl 144(%esp),%edi - # x12 = p - movl %eax,148(%esp) - # x6 = t - movl %edx,124(%esp) - # p += x8 - addl 132(%esp),%eax - # x1 = r - movl %esi,104(%esp) - # t += x2 - addl 108(%esp),%edx - # x11 = v - movl %edi,144(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x3 - addl 112(%esp),%eax - # p <<<= 7 - rol $7,%eax - # x5 = s - movl %ecx,120(%esp) - # t += x9 - addl 136(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x4 - movl 116(%esp),%esi - # r += s - add %ecx,%esi - # v = x14 - movl 156(%esp),%edi - # v += w - add %ebx,%edi - # p ^= x1 - xorl 104(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x11 - xorl 144(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x6 - xorl 124(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x12 - xorl 148(%esp),%edi - # x1 = p - movl %eax,104(%esp) - # x11 = t - movl %edx,144(%esp) - # p += x0 - addl 100(%esp),%eax - # x6 = r - movl %esi,124(%esp) - # t += x10 - addl 140(%esp),%edx - # x12 = v - movl %edi,148(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x2 - xorl 108(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x8 - xorl 132(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x7 - xorl 128(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x13 - xorl 152(%esp),%ebx - # x2 = p - movl %eax,108(%esp) - # x8 = t - movl %edx,132(%esp) - # p += x1 - addl 104(%esp),%eax - # x7 = s - movl %ecx,128(%esp) - # t += x11 - addl 144(%esp),%edx - # x13 = w - movl %ebx,152(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x3 - xorl 112(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x9 - xorl 136(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x4 - xorl 116(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x14 - xorl 156(%esp),%edi - # x3 = p - movl %eax,112(%esp) - # x9 = t - movl %edx,136(%esp) - # p += x2 - addl 108(%esp),%eax - # x4 = r - movl %esi,116(%esp) - # t += x8 - addl 132(%esp),%edx - # x14 = v - movl %edi,156(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x12 - addl 148(%esp),%eax - # x5 = s - movl %ecx,120(%esp) - # t += x6 - addl 124(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x1 - movl 104(%esp),%esi - # r += s - add %ecx,%esi - # v = x11 - movl 144(%esp),%edi - # v += w - add %ebx,%edi - # p <<<= 7 - rol $7,%eax - # p ^= x4 - xorl 116(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x14 - xorl 156(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x9 - xorl 136(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x3 - xorl 112(%esp),%edi - # x4 = p - movl %eax,116(%esp) - # x14 = t - movl %edx,156(%esp) - # p += x0 - addl 100(%esp),%eax - # x9 = r - movl %esi,136(%esp) - # t += x10 - addl 140(%esp),%edx - # x3 = v - movl %edi,112(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x8 - xorl 132(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x2 - xorl 108(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x13 - xorl 152(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x7 - xorl 128(%esp),%ebx - # x8 = p - movl %eax,132(%esp) - # x2 = t - movl %edx,108(%esp) - # p += x4 - addl 116(%esp),%eax - # x13 = s - movl %ecx,152(%esp) - # t += x14 - addl 156(%esp),%edx - # x7 = w - movl %ebx,128(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x12 - xorl 148(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x6 - xorl 124(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x1 - xorl 104(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x11 - xorl 144(%esp),%edi - # x12 = p - movl %eax,148(%esp) - # x6 = t - movl %edx,124(%esp) - # p += x8 - addl 132(%esp),%eax - # x1 = r - movl %esi,104(%esp) - # t += x2 - addl 108(%esp),%edx - # x11 = v - movl %edi,144(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x3 - addl 112(%esp),%eax - # p <<<= 7 - rol $7,%eax - # x5 = s - movl %ecx,120(%esp) - # t += x9 - addl 136(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x4 - movl 116(%esp),%esi - # r += s - add %ecx,%esi - # v = x14 - movl 156(%esp),%edi - # v += w - add %ebx,%edi - # p ^= x1 - xorl 104(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x11 - xorl 144(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x6 - xorl 124(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x12 - xorl 148(%esp),%edi - # x1 = p - movl %eax,104(%esp) - # x11 = t - movl %edx,144(%esp) - # p += x0 - addl 100(%esp),%eax - # x6 = r - movl %esi,124(%esp) - # t += x10 - addl 140(%esp),%edx - # x12 = v - movl %edi,148(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x2 - xorl 108(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x8 - xorl 132(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x7 - xorl 128(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x13 - xorl 152(%esp),%ebx - # x2 = p - movl %eax,108(%esp) - # x8 = t - movl %edx,132(%esp) - # p += x1 - addl 104(%esp),%eax - # x7 = s - movl %ecx,128(%esp) - # t += x11 - addl 144(%esp),%edx - # x13 = w - movl %ebx,152(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x3 - xorl 112(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x9 - xorl 136(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x4 - xorl 116(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x14 - xorl 156(%esp),%edi - # x3 = p - movl %eax,112(%esp) - # x9 = t - movl %edx,136(%esp) - # p += x2 - addl 108(%esp),%eax - # x4 = r - movl %esi,116(%esp) - # t += x8 - addl 132(%esp),%edx - # x14 = v - movl %edi,156(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # i -= 4 - sub $4,%ebp - # goto mainloop if unsigned > - ja ._mainloop - # x0 = p - movl %eax,100(%esp) - # x5 = s - movl %ecx,120(%esp) - # x10 = t - movl %edx,140(%esp) - # x15 = w - movl %ebx,160(%esp) - # out = out_backup - movl 72(%esp),%edi - # m = m_backup - movl 68(%esp),%esi - # in0 = x0 - movl 100(%esp),%eax - # in1 = x1 - movl 104(%esp),%ecx - # in0 += j0 - addl 164(%esp),%eax - # in1 += j1 - addl 168(%esp),%ecx - # in0 ^= *(uint32 *) (m + 0) - xorl 0(%esi),%eax - # in1 ^= *(uint32 *) (m + 4) - xorl 4(%esi),%ecx - # *(uint32 *) (out + 0) = in0 - movl %eax,0(%edi) - # *(uint32 *) (out + 4) = in1 - movl %ecx,4(%edi) - # in2 = x2 - movl 108(%esp),%eax - # in3 = x3 - movl 112(%esp),%ecx - # in2 += j2 - addl 172(%esp),%eax - # in3 += j3 - addl 176(%esp),%ecx - # in2 ^= *(uint32 *) (m + 8) - xorl 8(%esi),%eax - # in3 ^= *(uint32 *) (m + 12) - xorl 12(%esi),%ecx - # *(uint32 *) (out + 8) = in2 - movl %eax,8(%edi) - # *(uint32 *) (out + 12) = in3 - movl %ecx,12(%edi) - # in4 = x4 - movl 116(%esp),%eax - # in5 = x5 - movl 120(%esp),%ecx - # in4 += j4 - addl 180(%esp),%eax - # in5 += j5 - addl 184(%esp),%ecx - # in4 ^= *(uint32 *) (m + 16) - xorl 16(%esi),%eax - # in5 ^= *(uint32 *) (m + 20) - xorl 20(%esi),%ecx - # *(uint32 *) (out + 16) = in4 - movl %eax,16(%edi) - # *(uint32 *) (out + 20) = in5 - movl %ecx,20(%edi) - # in6 = x6 - movl 124(%esp),%eax - # in7 = x7 - movl 128(%esp),%ecx - # in6 += j6 - addl 188(%esp),%eax - # in7 += j7 - addl 192(%esp),%ecx - # in6 ^= *(uint32 *) (m + 24) - xorl 24(%esi),%eax - # in7 ^= *(uint32 *) (m + 28) - xorl 28(%esi),%ecx - # *(uint32 *) (out + 24) = in6 - movl %eax,24(%edi) - # *(uint32 *) (out + 28) = in7 - movl %ecx,28(%edi) - # in8 = x8 - movl 132(%esp),%eax - # in9 = x9 - movl 136(%esp),%ecx - # in8 += j8 - addl 196(%esp),%eax - # in9 += j9 - addl 200(%esp),%ecx - # in8 ^= *(uint32 *) (m + 32) - xorl 32(%esi),%eax - # in9 ^= *(uint32 *) (m + 36) - xorl 36(%esi),%ecx - # *(uint32 *) (out + 32) = in8 - movl %eax,32(%edi) - # *(uint32 *) (out + 36) = in9 - movl %ecx,36(%edi) - # in10 = x10 - movl 140(%esp),%eax - # in11 = x11 - movl 144(%esp),%ecx - # in10 += j10 - addl 204(%esp),%eax - # in11 += j11 - addl 208(%esp),%ecx - # in10 ^= *(uint32 *) (m + 40) - xorl 40(%esi),%eax - # in11 ^= *(uint32 *) (m + 44) - xorl 44(%esi),%ecx - # *(uint32 *) (out + 40) = in10 - movl %eax,40(%edi) - # *(uint32 *) (out + 44) = in11 - movl %ecx,44(%edi) - # in12 = x12 - movl 148(%esp),%eax - # in13 = x13 - movl 152(%esp),%ecx - # in12 += j12 - addl 212(%esp),%eax - # in13 += j13 - addl 216(%esp),%ecx - # in12 ^= *(uint32 *) (m + 48) - xorl 48(%esi),%eax - # in13 ^= *(uint32 *) (m + 52) - xorl 52(%esi),%ecx - # *(uint32 *) (out + 48) = in12 - movl %eax,48(%edi) - # *(uint32 *) (out + 52) = in13 - movl %ecx,52(%edi) - # in14 = x14 - movl 156(%esp),%eax - # in15 = x15 - movl 160(%esp),%ecx - # in14 += j14 - addl 220(%esp),%eax - # in15 += j15 - addl 224(%esp),%ecx - # in14 ^= *(uint32 *) (m + 56) - xorl 56(%esi),%eax - # in15 ^= *(uint32 *) (m + 60) - xorl 60(%esi),%ecx - # *(uint32 *) (out + 56) = in14 - movl %eax,56(%edi) - # *(uint32 *) (out + 60) = in15 - movl %ecx,60(%edi) - # bytes = bytes_backup - movl 76(%esp),%ebx - # in8 = j8 - movl 196(%esp),%eax - # in9 = j9 - movl 200(%esp),%ecx - # in8 += 1 - add $1,%eax - # in9 += 0 + carry - adc $0,%ecx - # j8 = in8 - movl %eax,196(%esp) - # j9 = in9 - movl %ecx,200(%esp) - # bytes - 64 - cmp $64,%ebx - # goto bytesatleast65 if unsigned> - ja ._bytesatleast65 - # goto bytesatleast64 if unsigned>= - jae ._bytesatleast64 - # m = out - mov %edi,%esi - # out = ctarget - movl 228(%esp),%edi - # i = bytes - mov %ebx,%ecx - # while (i) { *out++ = *m++; --i } - rep movsb -._bytesatleast64: - # x = x_backup - movl 64(%esp),%eax - # in8 = j8 - movl 196(%esp),%ecx - # in9 = j9 - movl 200(%esp),%edx - # *(uint32 *) (x + 32) = in8 - movl %ecx,32(%eax) - # *(uint32 *) (x + 36) = in9 - movl %edx,36(%eax) -._done: - # eax = eax_stack - movl 80(%esp),%eax - # ebx = ebx_stack - movl 84(%esp),%ebx - # esi = esi_stack - movl 88(%esp),%esi - # edi = edi_stack - movl 92(%esp),%edi - # ebp = ebp_stack - movl 96(%esp),%ebp - # leave - add %eax,%esp - ret -._bytesatleast65: - # bytes -= 64 - sub $64,%ebx - # out += 64 - add $64,%edi - # m += 64 - add $64,%esi - # goto bytesatleast1 - jmp ._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S deleted file mode 100644 index 03a4918f41ee..000000000000 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ /dev/null @@ -1,805 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) - mov %rsp,%r11 - and $31,%r11 - add $256,%r11 - sub %r11,%rsp - # x = arg1 - mov %rdi,%r8 - # m = arg2 - mov %rsi,%rsi - # out = arg3 - mov %rdx,%rdi - # bytes = arg4 - mov %rcx,%rdx - # unsigned>? bytes - 0 - cmp $0,%rdx - # comment:fp stack unchanged by jump - # goto done if !unsigned> - jbe ._done - # comment:fp stack unchanged by fallthrough -# start: -._start: - # r11_stack = r11 - movq %r11,0(%rsp) - # r12_stack = r12 - movq %r12,8(%rsp) - # r13_stack = r13 - movq %r13,16(%rsp) - # r14_stack = r14 - movq %r14,24(%rsp) - # r15_stack = r15 - movq %r15,32(%rsp) - # rbx_stack = rbx - movq %rbx,40(%rsp) - # rbp_stack = rbp - movq %rbp,48(%rsp) - # in0 = *(uint64 *) (x + 0) - movq 0(%r8),%rcx - # in2 = *(uint64 *) (x + 8) - movq 8(%r8),%r9 - # in4 = *(uint64 *) (x + 16) - movq 16(%r8),%rax - # in6 = *(uint64 *) (x + 24) - movq 24(%r8),%r10 - # in8 = *(uint64 *) (x + 32) - movq 32(%r8),%r11 - # in10 = *(uint64 *) (x + 40) - movq 40(%r8),%r12 - # in12 = *(uint64 *) (x + 48) - movq 48(%r8),%r13 - # in14 = *(uint64 *) (x + 56) - movq 56(%r8),%r14 - # j0 = in0 - movq %rcx,56(%rsp) - # j2 = in2 - movq %r9,64(%rsp) - # j4 = in4 - movq %rax,72(%rsp) - # j6 = in6 - movq %r10,80(%rsp) - # j8 = in8 - movq %r11,88(%rsp) - # j10 = in10 - movq %r12,96(%rsp) - # j12 = in12 - movq %r13,104(%rsp) - # j14 = in14 - movq %r14,112(%rsp) - # x_backup = x - movq %r8,120(%rsp) -# bytesatleast1: -._bytesatleast1: - # unsigned>= 32 - shr $32,%rdi - # x3 = j2 - movq 64(%rsp),%rsi - # x2 = x3 - mov %rsi,%rcx - # (uint64) x3 >>= 32 - shr $32,%rsi - # x5 = j4 - movq 72(%rsp),%r8 - # x4 = x5 - mov %r8,%r9 - # (uint64) x5 >>= 32 - shr $32,%r8 - # x5_stack = x5 - movq %r8,160(%rsp) - # x7 = j6 - movq 80(%rsp),%r8 - # x6 = x7 - mov %r8,%rax - # (uint64) x7 >>= 32 - shr $32,%r8 - # x9 = j8 - movq 88(%rsp),%r10 - # x8 = x9 - mov %r10,%r11 - # (uint64) x9 >>= 32 - shr $32,%r10 - # x11 = j10 - movq 96(%rsp),%r12 - # x10 = x11 - mov %r12,%r13 - # x10_stack = x10 - movq %r13,168(%rsp) - # (uint64) x11 >>= 32 - shr $32,%r12 - # x13 = j12 - movq 104(%rsp),%r13 - # x12 = x13 - mov %r13,%r14 - # (uint64) x13 >>= 32 - shr $32,%r13 - # x15 = j14 - movq 112(%rsp),%r15 - # x14 = x15 - mov %r15,%rbx - # (uint64) x15 >>= 32 - shr $32,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # i = 20 - mov $20,%r15 -# mainloop: -._mainloop: - # i_backup = i - movq %r15,184(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x12 + x0 - lea (%r14,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x4 ^= a - xor %rbp,%r9 - # b = x1 + x5 - lea (%rdi,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x9 ^= b - xor %rbp,%r10 - # a = x0 + x4 - lea (%rdx,%r9),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x8 ^= a - xor %rbp,%r11 - # b = x5 + x9 - lea (%r15,%r10),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x13 ^= b - xor %rbp,%r13 - # a = x4 + x8 - lea (%r9,%r11),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x12 ^= a - xor %rbp,%r14 - # b = x9 + x13 - lea (%r10,%r13),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x1 ^= b - xor %rbp,%rdi - # a = x8 + x12 - lea (%r11,%r14),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x13 + x1 - lea (%r13,%rdi),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x6 + x10 - lea (%rax,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x14 ^= c - xor %r15,%rbx - # c = x10 + x14 - lea (%rbp,%rbx),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x2 ^= c - xor %r15,%rcx - # c = x14 + x2 - lea (%rbx,%rcx),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x6 ^= c - xor %r15,%rax - # c = x2 + x6 - lea (%rcx,%rax),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x11 + x15 - lea (%r12,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x3 ^= d - xor %rbp,%rsi - # d = x15 + x3 - lea (%r15,%rsi),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x7 ^= d - xor %rbp,%r8 - # d = x3 + x7 - lea (%rsi,%r8),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x11 ^= d - xor %rbp,%r12 - # d = x7 + x11 - lea (%r8,%r12),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x3 + x0 - lea (%rsi,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x1 ^= a - xor %rbp,%rdi - # b = x4 + x5 - lea (%r9,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x6 ^= b - xor %rbp,%rax - # a = x0 + x1 - lea (%rdx,%rdi),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x2 ^= a - xor %rbp,%rcx - # b = x5 + x6 - lea (%r15,%rax),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x7 ^= b - xor %rbp,%r8 - # a = x1 + x2 - lea (%rdi,%rcx),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x3 ^= a - xor %rbp,%rsi - # b = x6 + x7 - lea (%rax,%r8),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x4 ^= b - xor %rbp,%r9 - # a = x2 + x3 - lea (%rcx,%rsi),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x7 + x4 - lea (%r8,%r9),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x9 + x10 - lea (%r10,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x11 ^= c - xor %r15,%r12 - # c = x10 + x11 - lea (%rbp,%r12),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x8 ^= c - xor %r15,%r11 - # c = x11 + x8 - lea (%r12,%r11),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x9 ^= c - xor %r15,%r10 - # c = x8 + x9 - lea (%r11,%r10),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x14 + x15 - lea (%rbx,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x12 ^= d - xor %rbp,%r14 - # d = x15 + x12 - lea (%r15,%r14),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x13 ^= d - xor %rbp,%r13 - # d = x12 + x13 - lea (%r14,%r13),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x14 ^= d - xor %rbp,%rbx - # d = x13 + x14 - lea (%r13,%rbx),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x12 + x0 - lea (%r14,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x4 ^= a - xor %rbp,%r9 - # b = x1 + x5 - lea (%rdi,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x9 ^= b - xor %rbp,%r10 - # a = x0 + x4 - lea (%rdx,%r9),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x8 ^= a - xor %rbp,%r11 - # b = x5 + x9 - lea (%r15,%r10),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x13 ^= b - xor %rbp,%r13 - # a = x4 + x8 - lea (%r9,%r11),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x12 ^= a - xor %rbp,%r14 - # b = x9 + x13 - lea (%r10,%r13),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x1 ^= b - xor %rbp,%rdi - # a = x8 + x12 - lea (%r11,%r14),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x13 + x1 - lea (%r13,%rdi),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x6 + x10 - lea (%rax,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x14 ^= c - xor %r15,%rbx - # c = x10 + x14 - lea (%rbp,%rbx),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x2 ^= c - xor %r15,%rcx - # c = x14 + x2 - lea (%rbx,%rcx),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x6 ^= c - xor %r15,%rax - # c = x2 + x6 - lea (%rcx,%rax),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x11 + x15 - lea (%r12,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x3 ^= d - xor %rbp,%rsi - # d = x15 + x3 - lea (%r15,%rsi),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x7 ^= d - xor %rbp,%r8 - # d = x3 + x7 - lea (%rsi,%r8),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x11 ^= d - xor %rbp,%r12 - # d = x7 + x11 - lea (%r8,%r12),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x3 + x0 - lea (%rsi,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x1 ^= a - xor %rbp,%rdi - # b = x4 + x5 - lea (%r9,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x6 ^= b - xor %rbp,%rax - # a = x0 + x1 - lea (%rdx,%rdi),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x2 ^= a - xor %rbp,%rcx - # b = x5 + x6 - lea (%r15,%rax),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x7 ^= b - xor %rbp,%r8 - # a = x1 + x2 - lea (%rdi,%rcx),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x3 ^= a - xor %rbp,%rsi - # b = x6 + x7 - lea (%rax,%r8),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x4 ^= b - xor %rbp,%r9 - # a = x2 + x3 - lea (%rcx,%rsi),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x7 + x4 - lea (%r8,%r9),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x9 + x10 - lea (%r10,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x11 ^= c - xor %r15,%r12 - # c = x10 + x11 - lea (%rbp,%r12),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x8 ^= c - xor %r15,%r11 - # c = x11 + x8 - lea (%r12,%r11),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x9 ^= c - xor %r15,%r10 - # c = x8 + x9 - lea (%r11,%r10),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x14 + x15 - lea (%rbx,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x12 ^= d - xor %rbp,%r14 - # d = x15 + x12 - lea (%r15,%r14),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x13 ^= d - xor %rbp,%r13 - # d = x12 + x13 - lea (%r14,%r13),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x14 ^= d - xor %rbp,%rbx - # d = x13 + x14 - lea (%r13,%rbx),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # i = i_backup - movq 184(%rsp),%r15 - # unsigned>? i -= 4 - sub $4,%r15 - # comment:fp stack unchanged by jump - # goto mainloop if unsigned> - ja ._mainloop - # (uint32) x2 += j2 - addl 64(%rsp),%ecx - # x3 <<= 32 - shl $32,%rsi - # x3 += j2 - addq 64(%rsp),%rsi - # (uint64) x3 >>= 32 - shr $32,%rsi - # x3 <<= 32 - shl $32,%rsi - # x2 += x3 - add %rsi,%rcx - # (uint32) x6 += j6 - addl 80(%rsp),%eax - # x7 <<= 32 - shl $32,%r8 - # x7 += j6 - addq 80(%rsp),%r8 - # (uint64) x7 >>= 32 - shr $32,%r8 - # x7 <<= 32 - shl $32,%r8 - # x6 += x7 - add %r8,%rax - # (uint32) x8 += j8 - addl 88(%rsp),%r11d - # x9 <<= 32 - shl $32,%r10 - # x9 += j8 - addq 88(%rsp),%r10 - # (uint64) x9 >>= 32 - shr $32,%r10 - # x9 <<= 32 - shl $32,%r10 - # x8 += x9 - add %r10,%r11 - # (uint32) x12 += j12 - addl 104(%rsp),%r14d - # x13 <<= 32 - shl $32,%r13 - # x13 += j12 - addq 104(%rsp),%r13 - # (uint64) x13 >>= 32 - shr $32,%r13 - # x13 <<= 32 - shl $32,%r13 - # x12 += x13 - add %r13,%r14 - # (uint32) x0 += j0 - addl 56(%rsp),%edx - # x1 <<= 32 - shl $32,%rdi - # x1 += j0 - addq 56(%rsp),%rdi - # (uint64) x1 >>= 32 - shr $32,%rdi - # x1 <<= 32 - shl $32,%rdi - # x0 += x1 - add %rdi,%rdx - # x5 = x5_stack - movq 160(%rsp),%rdi - # (uint32) x4 += j4 - addl 72(%rsp),%r9d - # x5 <<= 32 - shl $32,%rdi - # x5 += j4 - addq 72(%rsp),%rdi - # (uint64) x5 >>= 32 - shr $32,%rdi - # x5 <<= 32 - shl $32,%rdi - # x4 += x5 - add %rdi,%r9 - # x10 = x10_stack - movq 168(%rsp),%r8 - # (uint32) x10 += j10 - addl 96(%rsp),%r8d - # x11 <<= 32 - shl $32,%r12 - # x11 += j10 - addq 96(%rsp),%r12 - # (uint64) x11 >>= 32 - shr $32,%r12 - # x11 <<= 32 - shl $32,%r12 - # x10 += x11 - add %r12,%r8 - # x15 = x15_stack - movq 176(%rsp),%rdi - # (uint32) x14 += j14 - addl 112(%rsp),%ebx - # x15 <<= 32 - shl $32,%rdi - # x15 += j14 - addq 112(%rsp),%rdi - # (uint64) x15 >>= 32 - shr $32,%rdi - # x15 <<= 32 - shl $32,%rdi - # x14 += x15 - add %rdi,%rbx - # out = out_backup - movq 136(%rsp),%rdi - # m = m_backup - movq 144(%rsp),%rsi - # x0 ^= *(uint64 *) (m + 0) - xorq 0(%rsi),%rdx - # *(uint64 *) (out + 0) = x0 - movq %rdx,0(%rdi) - # x2 ^= *(uint64 *) (m + 8) - xorq 8(%rsi),%rcx - # *(uint64 *) (out + 8) = x2 - movq %rcx,8(%rdi) - # x4 ^= *(uint64 *) (m + 16) - xorq 16(%rsi),%r9 - # *(uint64 *) (out + 16) = x4 - movq %r9,16(%rdi) - # x6 ^= *(uint64 *) (m + 24) - xorq 24(%rsi),%rax - # *(uint64 *) (out + 24) = x6 - movq %rax,24(%rdi) - # x8 ^= *(uint64 *) (m + 32) - xorq 32(%rsi),%r11 - # *(uint64 *) (out + 32) = x8 - movq %r11,32(%rdi) - # x10 ^= *(uint64 *) (m + 40) - xorq 40(%rsi),%r8 - # *(uint64 *) (out + 40) = x10 - movq %r8,40(%rdi) - # x12 ^= *(uint64 *) (m + 48) - xorq 48(%rsi),%r14 - # *(uint64 *) (out + 48) = x12 - movq %r14,48(%rdi) - # x14 ^= *(uint64 *) (m + 56) - xorq 56(%rsi),%rbx - # *(uint64 *) (out + 56) = x14 - movq %rbx,56(%rdi) - # bytes = bytes_backup - movq 152(%rsp),%rdx - # in8 = j8 - movq 88(%rsp),%rcx - # in8 += 1 - add $1,%rcx - # j8 = in8 - movq %rcx,88(%rsp) - # unsigned>? unsigned - ja ._bytesatleast65 - # comment:fp stack unchanged by jump - # goto bytesatleast64 if !unsigned< - jae ._bytesatleast64 - # m = out - mov %rdi,%rsi - # out = ctarget - movq 128(%rsp),%rdi - # i = bytes - mov %rdx,%rcx - # while (i) { *out++ = *m++; --i } - rep movsb - # comment:fp stack unchanged by fallthrough -# bytesatleast64: -._bytesatleast64: - # x = x_backup - movq 120(%rsp),%rdi - # in8 = j8 - movq 88(%rsp),%rsi - # *(uint64 *) (x + 32) = in8 - movq %rsi,32(%rdi) - # r11 = r11_stack - movq 0(%rsp),%r11 - # r12 = r12_stack - movq 8(%rsp),%r12 - # r13 = r13_stack - movq 16(%rsp),%r13 - # r14 = r14_stack - movq 24(%rsp),%r14 - # r15 = r15_stack - movq 32(%rsp),%r15 - # rbx = rbx_stack - movq 40(%rsp),%rbx - # rbp = rbp_stack - movq 48(%rsp),%rbp - # comment:fp stack unchanged by fallthrough -# done: -._done: - # leave - add %r11,%rsp - mov %rdi,%rax - mov %rsi,%rdx - ret -# bytesatleast65: -._bytesatleast65: - # bytes -= 64 - sub $64,%rdx - # out += 64 - add $64,%rdi - # m += 64 - add $64,%rsi - # comment:fp stack unchanged by jump - # goto bytesatleast1 - jmp ._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c deleted file mode 100644 index b07d7d959806..000000000000 --- a/arch/x86/crypto/salsa20_glue.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Glue code for optimized assembly version of Salsa20. - * - * Copyright (c) 2007 Tan Swee Heng - * - * The assembly codes are public domain assembly codes written by Daniel. J. - * Bernstein . The codes are modified to include indentation - * and to remove extraneous comments and functions that are not needed. - * - i586 version, renamed as salsa20-i586-asm_32.S - * available from - * - x86-64 version, renamed as salsa20-x86_64-asm_64.S - * available from - * - * Also modified to set up the initial state using the generic C code rather - * than in assembly. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - */ - -#include -#include -#include -#include - -asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst, - u32 bytes); - -static int salsa20_asm_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, true); - - crypto_salsa20_init(state, ctx, walk.iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - salsa20_encrypt_bytes(state, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static struct skcipher_alg alg = { - .base.cra_name = "salsa20", - .base.cra_driver_name = "salsa20-asm", - .base.cra_priority = 200, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct salsa20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = SALSA20_MIN_KEY_SIZE, - .max_keysize = SALSA20_MAX_KEY_SIZE, - .ivsize = SALSA20_IV_SIZE, - .chunksize = SALSA20_BLOCK_SIZE, - .setkey = crypto_salsa20_setkey, - .encrypt = salsa20_asm_crypt, - .decrypt = salsa20_asm_crypt, -}; - -static int __init init(void) -{ - return crypto_register_skcipher(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_skcipher(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); -MODULE_ALIAS_CRYPTO("salsa20"); -MODULE_ALIAS_CRYPTO("salsa20-asm"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 30d54a56e64a..f3e40ac56d93 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1436,34 +1436,6 @@ config CRYPTO_SALSA20 The Salsa20 stream cipher algorithm is designed by Daniel J. Bernstein . See -config CRYPTO_SALSA20_586 - tristate "Salsa20 stream cipher algorithm (i586)" - depends on (X86 || UML_X86) && !64BIT - select CRYPTO_BLKCIPHER - select CRYPTO_SALSA20 - help - Salsa20 stream cipher algorithm. - - Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT - Stream Cipher Project. See - - The Salsa20 stream cipher algorithm is designed by Daniel J. - Bernstein . See - -config CRYPTO_SALSA20_X86_64 - tristate "Salsa20 stream cipher algorithm (x86_64)" - depends on (X86 || UML_X86) && 64BIT - select CRYPTO_BLKCIPHER - select CRYPTO_SALSA20 - help - Salsa20 stream cipher algorithm. - - Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT - Stream Cipher Project. See - - The Salsa20 stream cipher algorithm is designed by Daniel J. - Bernstein . See - config CRYPTO_CHACHA20 tristate "ChaCha20 cipher algorithm" select CRYPTO_BLKCIPHER -- cgit v1.2.3