diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-06 00:51:21 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-06 00:51:21 +0200 |
commit | 3e1a29b3bf66c2850ea8eba78c59c234921c0b69 (patch) | |
tree | 641a5428e3a1ef205fafede3d6a03dae85d30e92 /arch/x86 | |
parent | Merge tag 'fscrypt_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/gi... (diff) | |
parent | crypto: chtls - free beyond end rspq_skb_cache (diff) | |
download | linux-3e1a29b3bf66c2850ea8eba78c59c234921c0b69.tar.xz linux-3e1a29b3bf66c2850ea8eba78c59c234921c0b69.zip |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"API:
- Decryption test vectors are now automatically generated from
encryption test vectors.
Algorithms:
- Fix unaligned access issues in crc32/crc32c.
- Add zstd compression algorithm.
- Add AEGIS.
- Add MORUS.
Drivers:
- Add accelerated AEGIS/MORUS on x86.
- Add accelerated SM4 on arm64.
- Removed x86 assembly salsa implementation as it is slower than C.
- Add authenc(hmac(sha*), cbc(aes)) support in inside-secure.
- Add ctr(aes) support in crypto4xx.
- Add hardware key support in ccree.
- Add support for new Centaur CPU in via-rng"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (112 commits)
crypto: chtls - free beyond end rspq_skb_cache
crypto: chtls - kbuild warnings
crypto: chtls - dereference null variable
crypto: chtls - wait for memory sendmsg, sendpage
crypto: chtls - key len correction
crypto: salsa20 - Revert "crypto: salsa20 - export generic helpers"
crypto: x86/salsa20 - remove x86 salsa20 implementations
crypto: ccp - Add GET_ID SEV command
crypto: ccp - Add DOWNLOAD_FIRMWARE SEV command
crypto: qat - Add MODULE_FIRMWARE for all qat drivers
crypto: ccree - silence debug prints
crypto: ccree - better clock handling
crypto: ccree - correct host regs offset
crypto: chelsio - Remove separate buffer used for DMA map B0 block in CCM
crypt: chelsio - Send IV as Immediate for cipher algo
crypto: chelsio - Return -ENOSPC for transient busy indication.
crypto: caam/qi - fix warning in init_cgr()
crypto: caam - fix rfc4543 descriptors
crypto: caam - fix MC firmware detection
crypto: clarify licensing of OpenSSL asm code
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/crypto/Makefile | 25 | ||||
-rw-r--r-- | arch/x86/crypto/aegis128-aesni-asm.S | 749 | ||||
-rw-r--r-- | arch/x86/crypto/aegis128-aesni-glue.c | 407 | ||||
-rw-r--r-- | arch/x86/crypto/aegis128l-aesni-asm.S | 825 | ||||
-rw-r--r-- | arch/x86/crypto/aegis128l-aesni-glue.c | 407 | ||||
-rw-r--r-- | arch/x86/crypto/aegis256-aesni-asm.S | 702 | ||||
-rw-r--r-- | arch/x86/crypto/aegis256-aesni-glue.c | 407 | ||||
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 | ||||
-rw-r--r-- | arch/x86/crypto/morus1280-avx2-asm.S | 621 | ||||
-rw-r--r-- | arch/x86/crypto/morus1280-avx2-glue.c | 68 | ||||
-rw-r--r-- | arch/x86/crypto/morus1280-sse2-asm.S | 895 | ||||
-rw-r--r-- | arch/x86/crypto/morus1280-sse2-glue.c | 68 | ||||
-rw-r--r-- | arch/x86/crypto/morus1280_glue.c | 302 | ||||
-rw-r--r-- | arch/x86/crypto/morus640-sse2-asm.S | 614 | ||||
-rw-r--r-- | arch/x86/crypto/morus640-sse2-glue.c | 68 | ||||
-rw-r--r-- | arch/x86/crypto/morus640_glue.c | 298 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20-i586-asm_32.S | 938 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20-x86_64-asm_64.S | 805 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20_glue.c | 91 |
19 files changed, 6453 insertions, 1839 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5f07333bb224..a450ad573dcb 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o -obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o @@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o @@ -38,6 +36,16 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o + +obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o +obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o + +obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o +obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o + # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -55,11 +63,12 @@ ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/ obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ + + obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o endif aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o -salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o @@ -68,10 +77,16 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o +aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o +aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o + +morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o +morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o @@ -87,6 +102,8 @@ ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o + + morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S new file mode 100644 index 000000000000..9254e0b6cc06 --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -0,0 +1,749 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define KEY %xmm5 +#define MSG %xmm5 +#define T0 %xmm6 +#define T1 %xmm7 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis128_const, "aM", @progbits, 32 +.align 16 +.Laegis128_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +.align 16 +.Laegis128_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * aegis128_update + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state (shifted positions) + * changed: + * T0 + */ +.macro aegis128_update + movdqa STATE4, T0 + aesenc STATE0, STATE4 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc T0, STATE3 +.endm + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis128_aesni_init) + FRAME_BEGIN + + /* load IV: */ + movdqu (%rdx), T1 + + /* load key: */ + movdqa (%rsi), KEY + pxor KEY, T1 + movdqa T1, STATE0 + movdqa KEY, STATE3 + movdqa KEY, STATE4 + + /* load the constants: */ + movdqa .Laegis128_const_0, STATE2 + movdqa .Laegis128_const_1, STATE1 + pxor STATE2, STATE3 + pxor STATE1, STATE4 + + /* update 10 times with KEY / KEY xor IV: */ + aegis128_update; pxor KEY, STATE4 + aegis128_update; pxor T1, STATE3 + aegis128_update; pxor KEY, STATE2 + aegis128_update; pxor T1, STATE1 + aegis128_update; pxor KEY, STATE0 + aegis128_update; pxor T1, STATE4 + aegis128_update; pxor KEY, STATE3 + aegis128_update; pxor T1, STATE2 + aegis128_update; pxor KEY, STATE1 + aegis128_update; pxor T1, STATE0 + + /* store the state: */ + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_init) + +/* + * void crypto_aegis128_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis128_aesni_ad) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lad_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + movdqa 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_1 + + movdqa 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_2 + + movdqa 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_3 + + movdqa 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_4 + + movdqa 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_0 + + add $0x50, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + movdqu 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_1 + + movdqu 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_2 + + movdqu 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_3 + + movdqu 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_4 + + movdqu 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_0 + + add $0x50, SRC + jmp .Lad_u_loop + + /* store the state: */ +.Lad_out_0: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_1: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_2: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_3: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Lad_out_4: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_ad) + +.macro encrypt_block a s0 s1 s2 s3 s4 i + movdq\a (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + pxor \s1, T0 + pxor \s4, T0 + movdqa \s2, T1 + pand \s3, T1 + pxor T1, T0 + movdq\a T0, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN + cmp $0x10, LEN + jl .Lenc_out_\i +.endm + +/* + * void crypto_aegis128_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lenc_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Lenc_u_loop + + /* store the state: */ +.Lenc_out_0: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_1: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_2: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_3: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out_4: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_enc) + +/* + * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + pxor STATE1, T0 + pxor STATE4, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + + call __store_partial + + aegis128_update + pxor MSG, STATE4 + + /* store the state: */ + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + + FRAME_END +ENDPROC(crypto_aegis128_aesni_enc_tail) + +.macro decrypt_block a s0 s1 s2 s3 s4 i + movdq\a (\i * 0x10)(SRC), MSG + pxor \s1, MSG + pxor \s4, MSG + movdqa \s2, T1 + pand \s3, T1 + pxor T1, MSG + movdq\a MSG, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN + cmp $0x10, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Ldec_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST + jmp .Ldec_u_loop + + /* store the state: */ +.Ldec_out_0: + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_1: + movdqu STATE3, 0x00(STATEP) + movdqu STATE4, 0x10(STATEP) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_2: + movdqu STATE2, 0x00(STATEP) + movdqu STATE3, 0x10(STATEP) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_3: + movdqu STATE1, 0x00(STATEP) + movdqu STATE2, 0x10(STATEP) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out_4: + movdqu STATE0, 0x00(STATEP) + movdqu STATE1, 0x10(STATEP) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_dec) + +/* + * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* decrypt message: */ + call __load_partial + + pxor STATE1, MSG + pxor STATE4, MSG + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, MSG + + movdqa MSG, T0 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Laegis128_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + aegis128_update + pxor MSG, STATE4 + + /* store the state: */ + movdqu STATE4, 0x00(STATEP) + movdqu STATE0, 0x10(STATEP) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_dec_tail) + +/* + * void crypto_aegis128_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis128_aesni_final) + FRAME_BEGIN + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG + + /* update state: */ + aegis128_update; pxor MSG, STATE4 + aegis128_update; pxor MSG, STATE3 + aegis128_update; pxor MSG, STATE2 + aegis128_update; pxor MSG, STATE1 + aegis128_update; pxor MSG, STATE0 + aegis128_update; pxor MSG, STATE4 + aegis128_update; pxor MSG, STATE3 + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pxor STATE1, MSG + pxor STATE2, MSG + pxor STATE3, MSG + pxor STATE4, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c new file mode 100644 index 000000000000..5de7c0d46edf --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128 Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS128_BLOCK_ALIGN 16 +#define AEGIS128_BLOCK_SIZE 16 +#define AEGIS128_NONCE_SIZE 16 +#define AEGIS128_STATE_BLOCKS 5 +#define AEGIS128_KEY_SIZE 16 +#define AEGIS128_MIN_AUTH_SIZE 8 +#define AEGIS128_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS128_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis128_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS128_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS128_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis128_aesni_ad(state, + AEGIS128_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis128_aesni_ad(state, left, src); + + src += left & ~(AEGIS128_BLOCK_SIZE - 1); + left &= AEGIS128_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); + crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis128_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS128_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); + + if (keylen != AEGIS128_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); + + return 0; +} + +static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS128_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS128_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis128_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis128_aesni_process_crypt(&state, req, ops); + crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis128_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis128_aesni_enc, + .crypt_tail = crypto_aegis128_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis128_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis128_aesni_dec, + .crypt_tail = crypto_aegis128_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128_aesni_alg[] = { + { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, + .init = crypto_aegis128_aesni_init_tfm, + .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis128", + .cra_driver_name = "__aegis128-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis128_aesni_setkey, + .setauthsize = cryptd_aegis128_aesni_setauthsize, + .encrypt = cryptd_aegis128_aesni_encrypt, + .decrypt = cryptd_aegis128_aesni_decrypt, + .init = cryptd_aegis128_aesni_init_tfm, + .exit = cryptd_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis128", + .cra_driver_name = "aegis128-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis128_aesni_alg, + ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +static void __exit crypto_aegis128_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis128_aesni_alg, + ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +module_init(crypto_aegis128_aesni_module_init); +module_exit(crypto_aegis128_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128"); +MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S new file mode 100644 index 000000000000..9263c344f2c7 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -0,0 +1,825 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define STATE5 %xmm5 +#define STATE6 %xmm6 +#define STATE7 %xmm7 +#define MSG0 %xmm8 +#define MSG1 %xmm9 +#define T0 %xmm10 +#define T1 %xmm11 +#define T2 %xmm12 +#define T3 %xmm13 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32 +.align 16 +.Laegis128l_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128l_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16 +.align 16 +.Laegis128l_counter0: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Laegis128l_counter1: + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG0 - first message block + * MSG1 - second message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG0, MSG0 + pxor MSG1, MSG1 + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG0 + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG0 + movq (%r8), T0 + pxor T0, MSG0 + +.Lld_partial_8: + mov LEN, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + movdqa MSG0, MSG1 + movdqu (SRC), MSG0 + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - first message block + * T1 - second message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0, (%r9) + movdqa T1, T0 + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +.macro update + movdqa STATE7, T0 + aesenc STATE0, STATE7 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc STATE4, STATE3 + aesenc STATE5, STATE4 + aesenc STATE6, STATE5 + aesenc T0, STATE6 +.endm + +.macro update0 + update + pxor MSG0, STATE7 + pxor MSG1, STATE3 +.endm + +.macro update1 + update + pxor MSG0, STATE6 + pxor MSG1, STATE2 +.endm + +.macro update2 + update + pxor MSG0, STATE5 + pxor MSG1, STATE1 +.endm + +.macro update3 + update + pxor MSG0, STATE4 + pxor MSG1, STATE0 +.endm + +.macro update4 + update + pxor MSG0, STATE3 + pxor MSG1, STATE7 +.endm + +.macro update5 + update + pxor MSG0, STATE2 + pxor MSG1, STATE6 +.endm + +.macro update6 + update + pxor MSG0, STATE1 + pxor MSG1, STATE5 +.endm + +.macro update7 + update + pxor MSG0, STATE0 + pxor MSG1, STATE4 +.endm + +.macro state_load + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + movdqu 0x50(STATEP), STATE5 + movdqu 0x60(STATEP), STATE6 + movdqu 0x70(STATEP), STATE7 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 s6 s7 + movdqu \s7, 0x00(STATEP) + movdqu \s0, 0x10(STATEP) + movdqu \s1, 0x20(STATEP) + movdqu \s2, 0x30(STATEP) + movdqu \s3, 0x40(STATEP) + movdqu \s4, 0x50(STATEP) + movdqu \s5, 0x60(STATEP) + movdqu \s6, 0x70(STATEP) +.endm + +.macro state_store0 + state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro state_store1 + state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro state_store2 + state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store3 + state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store4 + state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store5 + state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro state_store6 + state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro state_store7 + state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +/* + * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis128l_aesni_init) + FRAME_BEGIN + + /* load key: */ + movdqa (%rsi), MSG1 + movdqa MSG1, STATE0 + movdqa MSG1, STATE4 + movdqa MSG1, STATE5 + movdqa MSG1, STATE6 + movdqa MSG1, STATE7 + + /* load IV: */ + movdqu (%rdx), MSG0 + pxor MSG0, STATE0 + pxor MSG0, STATE4 + + /* load the constants: */ + movdqa .Laegis128l_const_0, STATE2 + movdqa .Laegis128l_const_1, STATE1 + movdqa STATE1, STATE3 + pxor STATE2, STATE5 + pxor STATE1, STATE6 + pxor STATE2, STATE7 + + /* update 10 times with IV and KEY: */ + update0 + update1 + update2 + update3 + update4 + update5 + update6 + update7 + update0 + update1 + + state_store1 + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_init) + +.macro ad_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + update\i + sub $0x20, LEN + cmp $0x20, LEN + jl .Lad_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis128l_aesni_ad) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Lad_out + + state_load + + mov SRC, %r8 + and $0xf, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + ad_block a 0 + ad_block a 1 + ad_block a 2 + ad_block a 3 + ad_block a 4 + ad_block a 5 + ad_block a 6 + ad_block a 7 + + add $0x100, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + ad_block u 0 + ad_block u 1 + ad_block u 2 + ad_block u 3 + ad_block u 4 + ad_block u 5 + ad_block u 6 + ad_block u 7 + + add $0x100, SRC + jmp .Lad_u_loop + +.Lad_out_0: + state_store0 + FRAME_END + ret + +.Lad_out_1: + state_store1 + FRAME_END + ret + +.Lad_out_2: + state_store2 + FRAME_END + ret + +.Lad_out_3: + state_store3 + FRAME_END + ret + +.Lad_out_4: + state_store4 + FRAME_END + ret + +.Lad_out_5: + state_store5 + FRAME_END + ret + +.Lad_out_6: + state_store6 + FRAME_END + ret + +.Lad_out_7: + state_store7 + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_ad) + +.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7 + pxor \s1, \m0 + pxor \s6, \m0 + movdqa \s2, T3 + pand \s3, T3 + pxor T3, \m0 + + pxor \s2, \m1 + pxor \s5, \m1 + movdqa \s6, T3 + pand \s7, T3 + pxor T3, \m1 +.endm + +.macro crypt0 m0 m1 + crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro crypt1 m0 m1 + crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro crypt2 m0 m1 + crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt3 m0 m1 + crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt4 m0 m1 + crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt5 m0 m1 + crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro crypt6 m0 m1 + crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro crypt7 m0 m1 + crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +.macro encrypt_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + movdqa MSG0, T0 + movdqa MSG1, T1 + crypt\i T0, T1 + movdq\a T0, (\i * 0x20 + 0x00)(DST) + movdq\a T1, (\i * 0x20 + 0x10)(DST) + + update\i + + sub $0x20, LEN + cmp $0x20, LEN + jl .Lenc_out_\i +.endm + +.macro decrypt_block a i + movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 + movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 + crypt\i MSG0, MSG1 + movdq\a MSG0, (\i * 0x20 + 0x00)(DST) + movdq\a MSG1, (\i * 0x20 + 0x10)(DST) + + update\i + + sub $0x20, LEN + cmp $0x20, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Lenc_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xf, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a 0 + encrypt_block a 1 + encrypt_block a 2 + encrypt_block a 3 + encrypt_block a 4 + encrypt_block a 5 + encrypt_block a 6 + encrypt_block a 7 + + add $0x100, SRC + add $0x100, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u 0 + encrypt_block u 1 + encrypt_block u 2 + encrypt_block u 3 + encrypt_block u 4 + encrypt_block u 5 + encrypt_block u 6 + encrypt_block u 7 + + add $0x100, SRC + add $0x100, DST + jmp .Lenc_u_loop + +.Lenc_out_0: + state_store0 + FRAME_END + ret + +.Lenc_out_1: + state_store1 + FRAME_END + ret + +.Lenc_out_2: + state_store2 + FRAME_END + ret + +.Lenc_out_3: + state_store3 + FRAME_END + ret + +.Lenc_out_4: + state_store4 + FRAME_END + ret + +.Lenc_out_5: + state_store5 + FRAME_END + ret + +.Lenc_out_6: + state_store6 + FRAME_END + ret + +.Lenc_out_7: + state_store7 + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_enc) + +/* + * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc_tail) + FRAME_BEGIN + + state_load + + /* encrypt message: */ + call __load_partial + + movdqa MSG0, T0 + movdqa MSG1, T1 + crypt0 T0, T1 + + call __store_partial + + update0 + + state_store0 + + FRAME_END +ENDPROC(crypto_aegis128l_aesni_enc_tail) + +/* + * void crypto_aegis128l_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec) + FRAME_BEGIN + + cmp $0x20, LEN + jb .Ldec_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a 0 + decrypt_block a 1 + decrypt_block a 2 + decrypt_block a 3 + decrypt_block a 4 + decrypt_block a 5 + decrypt_block a 6 + decrypt_block a 7 + + add $0x100, SRC + add $0x100, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u 0 + decrypt_block u 1 + decrypt_block u 2 + decrypt_block u 3 + decrypt_block u 4 + decrypt_block u 5 + decrypt_block u 6 + decrypt_block u 7 + + add $0x100, SRC + add $0x100, DST + jmp .Ldec_u_loop + +.Ldec_out_0: + state_store0 + FRAME_END + ret + +.Ldec_out_1: + state_store1 + FRAME_END + ret + +.Ldec_out_2: + state_store2 + FRAME_END + ret + +.Ldec_out_3: + state_store3 + FRAME_END + ret + +.Ldec_out_4: + state_store4 + FRAME_END + ret + +.Ldec_out_5: + state_store5 + FRAME_END + ret + +.Ldec_out_6: + state_store6 + FRAME_END + ret + +.Ldec_out_7: + state_store7 + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_dec) + +/* + * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec_tail) + FRAME_BEGIN + + state_load + + /* decrypt message: */ + call __load_partial + + crypt0 MSG0, MSG1 + + movdqa MSG0, T0 + movdqa MSG1, T1 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa T0, T1 + movdqa .Laegis128l_counter0, T2 + movdqa .Laegis128l_counter1, T3 + pcmpgtb T2, T0 + pcmpgtb T3, T1 + pand T0, MSG0 + pand T1, MSG1 + + update0 + + state_store0 + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_dec_tail) + +/* + * void crypto_aegis128l_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis128l_aesni_final) + FRAME_BEGIN + + state_load + + /* prepare length block: */ + movq %rdx, MSG0 + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG0 + psllq $3, MSG0 /* multiply by 8 (to get bit count) */ + + pxor STATE2, MSG0 + movdqa MSG0, MSG1 + + /* update state: */ + update0 + update1 + update2 + update3 + update4 + update5 + update6 + + /* xor tag: */ + movdqu (%rsi), T0 + + pxor STATE1, T0 + pxor STATE2, T0 + pxor STATE3, T0 + pxor STATE4, T0 + pxor STATE5, T0 + pxor STATE6, T0 + pxor STATE7, T0 + + movdqu T0, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis128l_aesni_final) diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c new file mode 100644 index 000000000000..876e4866e633 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128L Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS128L_BLOCK_ALIGN 16 +#define AEGIS128L_BLOCK_SIZE 32 +#define AEGIS128L_NONCE_SIZE 16 +#define AEGIS128L_STATE_BLOCKS 8 +#define AEGIS128L_KEY_SIZE 16 +#define AEGIS128L_MIN_AUTH_SIZE 8 +#define AEGIS128L_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128l_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128l_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS128L_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis128l_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS128L_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS128L_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis128l_aesni_ad(state, + AEGIS128L_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis128l_aesni_ad(state, left, src); + + src += left & ~(AEGIS128L_BLOCK_SIZE - 1); + left &= AEGIS128L_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos); + crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis128l_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS128L_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead); + + if (keylen != AEGIS128L_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE); + + return 0; +} + +static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS128L_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS128L_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis128l_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); + crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis128l_aesni_process_crypt(&state, req, ops); + crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis128l_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis128l_aesni_enc, + .crypt_tail = crypto_aegis128l_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis128l_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis128l_aesni_dec, + .crypt_tail = crypto_aegis128l_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128l_aesni_alg[] = { + { + .setkey = crypto_aegis128l_aesni_setkey, + .setauthsize = crypto_aegis128l_aesni_setauthsize, + .encrypt = crypto_aegis128l_aesni_encrypt, + .decrypt = crypto_aegis128l_aesni_decrypt, + .init = crypto_aegis128l_aesni_init_tfm, + .exit = crypto_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis128l", + .cra_driver_name = "__aegis128l-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis128l_aesni_setkey, + .setauthsize = cryptd_aegis128l_aesni_setauthsize, + .encrypt = cryptd_aegis128l_aesni_encrypt, + .decrypt = cryptd_aegis128l_aesni_decrypt, + .init = cryptd_aegis128l_aesni_init_tfm, + .exit = cryptd_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis128l", + .cra_driver_name = "aegis128l-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128l_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis128l_aesni_alg, + ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +static void __exit crypto_aegis128l_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis128l_aesni_alg, + ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +module_init(crypto_aegis128l_aesni_module_init); +module_exit(crypto_aegis128l_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128l"); +MODULE_ALIAS_CRYPTO("aegis128l-aesni"); diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S new file mode 100644 index 000000000000..1d977d515bf9 --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -0,0 +1,702 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define STATE5 %xmm5 +#define MSG %xmm6 +#define T0 %xmm7 +#define T1 %xmm8 +#define T2 %xmm9 +#define T3 %xmm10 + +#define STATEP %rdi +#define LEN %rsi +#define SRC %rdx +#define DST %rcx + +.section .rodata.cst16.aegis256_const, "aM", @progbits, 32 +.align 16 +.Laegis256_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis256_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16 +.align 16 +.Laegis256_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * __load_partial: internal ABI + * input: + * LEN - bytes + * SRC - src + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov LEN, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov LEN, %r8 + and $0x1E, %r8 + add SRC, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov LEN, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov LEN, %r8 + and $0x1C, %r8 + add SRC, %r8 + shl $0x10, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov LEN, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov LEN, %r8 + and $0x18, %r8 + add SRC, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov LEN, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov LEN, %r8 + and $0x10, %r8 + add SRC, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * LEN - bytes + * DST - dst + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov LEN, %r8 + mov DST, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $0x10, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +.macro update + movdqa STATE5, T0 + aesenc STATE0, STATE5 + aesenc STATE1, STATE0 + aesenc STATE2, STATE1 + aesenc STATE3, STATE2 + aesenc STATE4, STATE3 + aesenc T0, STATE4 +.endm + +.macro update0 m + update + pxor \m, STATE5 +.endm + +.macro update1 m + update + pxor \m, STATE4 +.endm + +.macro update2 m + update + pxor \m, STATE3 +.endm + +.macro update3 m + update + pxor \m, STATE2 +.endm + +.macro update4 m + update + pxor \m, STATE1 +.endm + +.macro update5 m + update + pxor \m, STATE0 +.endm + +.macro state_load + movdqu 0x00(STATEP), STATE0 + movdqu 0x10(STATEP), STATE1 + movdqu 0x20(STATEP), STATE2 + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + movdqu 0x50(STATEP), STATE5 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 + movdqu \s5, 0x00(STATEP) + movdqu \s0, 0x10(STATEP) + movdqu \s1, 0x20(STATEP) + movdqu \s2, 0x30(STATEP) + movdqu \s3, 0x40(STATEP) + movdqu \s4, 0x50(STATEP) +.endm + +.macro state_store0 + state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store1 + state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store2 + state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store3 + state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro state_store4 + state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro state_store5 + state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +/* + * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis256_aesni_init) + FRAME_BEGIN + + /* load key: */ + movdqa 0x00(%rsi), MSG + movdqa 0x10(%rsi), T1 + movdqa MSG, STATE4 + movdqa T1, STATE5 + + /* load IV: */ + movdqu 0x00(%rdx), T2 + movdqu 0x10(%rdx), T3 + pxor MSG, T2 + pxor T1, T3 + movdqa T2, STATE0 + movdqa T3, STATE1 + + /* load the constants: */ + movdqa .Laegis256_const_0, STATE3 + movdqa .Laegis256_const_1, STATE2 + pxor STATE3, STATE4 + pxor STATE2, STATE5 + + /* update 10 times with IV and KEY: */ + update0 MSG + update1 T1 + update2 T2 + update3 T3 + update4 MSG + update5 T1 + update0 T2 + update1 T3 + update2 MSG + update3 T1 + update4 T2 + update5 T3 + update0 MSG + update1 T1 + update2 T2 + update3 T3 + + state_store3 + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_init) + +.macro ad_block a i + movdq\a (\i * 0x10)(SRC), MSG + update\i MSG + sub $0x10, LEN + cmp $0x10, LEN + jl .Lad_out_\i +.endm + +/* + * void crypto_aegis256_aesni_ad(void *state, unsigned int length, + * const void *data); + */ +ENTRY(crypto_aegis256_aesni_ad) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lad_out + + state_load + + mov SRC, %r8 + and $0xf, %r8 + jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: + ad_block a 0 + ad_block a 1 + ad_block a 2 + ad_block a 3 + ad_block a 4 + ad_block a 5 + + add $0x60, SRC + jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: + ad_block u 0 + ad_block u 1 + ad_block u 2 + ad_block u 3 + ad_block u 4 + ad_block u 5 + + add $0x60, SRC + jmp .Lad_u_loop + +.Lad_out_0: + state_store0 + FRAME_END + ret + +.Lad_out_1: + state_store1 + FRAME_END + ret + +.Lad_out_2: + state_store2 + FRAME_END + ret + +.Lad_out_3: + state_store3 + FRAME_END + ret + +.Lad_out_4: + state_store4 + FRAME_END + ret + +.Lad_out_5: + state_store5 + FRAME_END + ret + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_ad) + +.macro crypt m s0 s1 s2 s3 s4 s5 + pxor \s1, \m + pxor \s4, \m + pxor \s5, \m + movdqa \s2, T3 + pand \s3, T3 + pxor T3, \m +.endm + +.macro crypt0 m + crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt1 m + crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt2 m + crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt3 m + crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro crypt4 m + crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro crypt5 m + crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +.macro encrypt_block a i + movdq\a (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + crypt\i T0 + movdq\a T0, (\i * 0x10)(DST) + + update\i MSG + + sub $0x10, LEN + cmp $0x10, LEN + jl .Lenc_out_\i +.endm + +.macro decrypt_block a i + movdq\a (\i * 0x10)(SRC), MSG + crypt\i MSG + movdq\a MSG, (\i * 0x10)(DST) + + update\i MSG + + sub $0x10, LEN + cmp $0x10, LEN + jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis256_aesni_enc(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Lenc_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xf, %r8 + jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: + encrypt_block a 0 + encrypt_block a 1 + encrypt_block a 2 + encrypt_block a 3 + encrypt_block a 4 + encrypt_block a 5 + + add $0x60, SRC + add $0x60, DST + jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: + encrypt_block u 0 + encrypt_block u 1 + encrypt_block u 2 + encrypt_block u 3 + encrypt_block u 4 + encrypt_block u 5 + + add $0x60, SRC + add $0x60, DST + jmp .Lenc_u_loop + +.Lenc_out_0: + state_store0 + FRAME_END + ret + +.Lenc_out_1: + state_store1 + FRAME_END + ret + +.Lenc_out_2: + state_store2 + FRAME_END + ret + +.Lenc_out_3: + state_store3 + FRAME_END + ret + +.Lenc_out_4: + state_store4 + FRAME_END + ret + +.Lenc_out_5: + state_store5 + FRAME_END + ret + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_enc) + +/* + * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc_tail) + FRAME_BEGIN + + state_load + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + crypt0 T0 + + call __store_partial + + update0 MSG + + state_store0 + + FRAME_END +ENDPROC(crypto_aegis256_aesni_enc_tail) + +/* + * void crypto_aegis256_aesni_dec(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec) + FRAME_BEGIN + + cmp $0x10, LEN + jb .Ldec_out + + state_load + + mov SRC, %r8 + or DST, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: + decrypt_block a 0 + decrypt_block a 1 + decrypt_block a 2 + decrypt_block a 3 + decrypt_block a 4 + decrypt_block a 5 + + add $0x60, SRC + add $0x60, DST + jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: + decrypt_block u 0 + decrypt_block u 1 + decrypt_block u 2 + decrypt_block u 3 + decrypt_block u 4 + decrypt_block u 5 + + add $0x60, SRC + add $0x60, DST + jmp .Ldec_u_loop + +.Ldec_out_0: + state_store0 + FRAME_END + ret + +.Ldec_out_1: + state_store1 + FRAME_END + ret + +.Ldec_out_2: + state_store2 + FRAME_END + ret + +.Ldec_out_3: + state_store3 + FRAME_END + ret + +.Ldec_out_4: + state_store4 + FRAME_END + ret + +.Ldec_out_5: + state_store5 + FRAME_END + ret + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_dec) + +/* + * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length, + * const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec_tail) + FRAME_BEGIN + + state_load + + /* decrypt message: */ + call __load_partial + + crypt0 MSG + + movdqa MSG, T0 + call __store_partial + + /* mask with byte count: */ + movq LEN, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Laegis256_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + update0 MSG + + state_store0 + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_dec_tail) + +/* + * void crypto_aegis256_aesni_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis256_aesni_final) + FRAME_BEGIN + + state_load + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG + + /* update state: */ + update0 MSG + update1 MSG + update2 MSG + update3 MSG + update4 MSG + update5 MSG + update0 MSG + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pxor STATE1, MSG + pxor STATE2, MSG + pxor STATE3, MSG + pxor STATE4, MSG + pxor STATE5, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_aegis256_aesni_final) diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c new file mode 100644 index 000000000000..2b5dd3af8f4d --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-256 Authenticated-Encryption Algorithm + * Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS256_BLOCK_ALIGN 16 +#define AEGIS256_BLOCK_SIZE 16 +#define AEGIS256_NONCE_SIZE 32 +#define AEGIS256_STATE_BLOCKS 6 +#define AEGIS256_KEY_SIZE 32 +#define AEGIS256_MIN_AUTH_SIZE 8 +#define AEGIS256_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis256_aesni_ad( + void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis256_aesni_enc( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_enc_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec_tail( + void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_final( + void *state, void *tag_xor, unsigned int cryptlen, + unsigned int assoclen); + +struct aegis_block { + u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN); +}; + +struct aegis_state { + struct aegis_block blocks[AEGIS256_STATE_BLOCKS]; +}; + +struct aegis_ctx { + struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE]; +}; + +struct aegis_crypt_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, unsigned int length, const void *src, + void *dst); + void (*crypt_tail)(void *state, unsigned int length, const void *src, + void *dst); +}; + +static void crypto_aegis256_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, + unsigned int assoclen) +{ + struct scatter_walk walk; + struct aegis_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= AEGIS256_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = AEGIS256_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + crypto_aegis256_aesni_ad(state, + AEGIS256_BLOCK_SIZE, + buf.bytes); + pos = 0; + left -= fill; + src += fill; + } + + crypto_aegis256_aesni_ad(state, left, src); + + src += left & ~(AEGIS256_BLOCK_SIZE - 1); + left &= AEGIS256_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + pos += left; + assoclen -= size; + + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos); + crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes); + } +} + +static void crypto_aegis256_aesni_process_crypt( + struct aegis_state *state, struct aead_request *req, + const struct aegis_crypt_ops *ops) +{ + struct skcipher_walk walk; + u8 *src, *dst; + unsigned int chunksize, base; + + ops->skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + src = walk.src.virt.addr; + dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops->crypt_blocks(state, chunksize, src, dst); + + base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1); + src += base; + dst += base; + chunksize &= AEGIS256_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops->crypt_tail(state, chunksize, src, dst); + + skcipher_walk_done(&walk, 0); + } +} + +static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead) +{ + u8 *ctx = crypto_aead_ctx(aead); + ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); + return (void *)ctx; +} + +static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead); + + if (keylen != AEGIS256_KEY_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key, key, AEGIS256_KEY_SIZE); + + return 0; +} + +static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + if (authsize > AEGIS256_MAX_AUTH_SIZE) + return -EINVAL; + if (authsize < AEGIS256_MIN_AUTH_SIZE) + return -EINVAL; + return 0; +} + +static void crypto_aegis256_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, + const struct aegis_crypt_ops *ops) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); + struct aegis_state state; + + kernel_fpu_begin(); + + crypto_aegis256_aesni_init(&state, ctx->key, req->iv); + crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); + crypto_aegis256_aesni_process_crypt(&state, req, ops); + crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +static int crypto_aegis256_aesni_encrypt(struct aead_request *req) +{ + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = crypto_aegis256_aesni_enc, + .crypt_tail = crypto_aegis256_aesni_enc_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} + +static int crypto_aegis256_aesni_decrypt(struct aead_request *req) +{ + static const struct aegis_block zeros = {}; + + static const struct aegis_crypt_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = crypto_aegis256_aesni_dec, + .crypt_tail = crypto_aegis256_aesni_dec_tail, + }; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + + return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ + return 0; +} + +static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, + const u8 *key, unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} + +static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} + +static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} + +static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis256_aesni_alg[] = { + { + .setkey = crypto_aegis256_aesni_setkey, + .setauthsize = crypto_aegis256_aesni_setauthsize, + .encrypt = crypto_aegis256_aesni_encrypt, + .decrypt = crypto_aegis256_aesni_decrypt, + .init = crypto_aegis256_aesni_init_tfm, + .exit = crypto_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + + .cra_name = "__aegis256", + .cra_driver_name = "__aegis256-aesni", + + .cra_module = THIS_MODULE, + } + }, { + .setkey = cryptd_aegis256_aesni_setkey, + .setauthsize = cryptd_aegis256_aesni_setauthsize, + .encrypt = cryptd_aegis256_aesni_encrypt, + .decrypt = cryptd_aegis256_aesni_decrypt, + .init = cryptd_aegis256_aesni_init_tfm, + .exit = cryptd_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_alignmask = 0, + + .cra_priority = 400, + + .cra_name = "aegis256", + .cra_driver_name = "aegis256-aesni", + + .cra_module = THIS_MODULE, + } + } +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis256_aesni_module_init(void) +{ + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_aegis256_aesni_alg, + ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +static void __exit crypto_aegis256_aesni_module_exit(void) +{ + crypto_unregister_aeads(crypto_aegis256_aesni_alg, + ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +module_init(crypto_aegis256_aesni_module_init); +module_exit(crypto_aegis256_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis256"); +MODULE_ALIAS_CRYPTO("aegis256-aesni"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 0420bab19efb..2ddbe3a1868b 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -364,5 +364,5 @@ module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " - "acclerated by PCLMULQDQ-NI"); + "accelerated by PCLMULQDQ-NI"); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S new file mode 100644 index 000000000000..37d422e77931 --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -0,0 +1,621 @@ +/* + * AVX2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0 %ymm0 +#define STATE0_LOW %xmm0 +#define STATE1 %ymm1 +#define STATE2 %ymm2 +#define STATE3 %ymm3 +#define STATE4 %ymm4 +#define KEY %ymm5 +#define MSG %ymm5 +#define MSG_LOW %xmm5 +#define T0 %ymm6 +#define T0_LOW %xmm6 +#define T1 %ymm7 + +.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 +.align 32 +.Lmorus1280_const: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 +.align 32 +.Lmorus1280_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro morus1280_round s0, s1, s2, s3, s4, b, w + vpand \s1, \s2, T0 + vpxor T0, \s0, \s0 + vpxor \s3, \s0, \s0 + vpsllq $\b, \s0, T0 + vpsrlq $(64 - \b), \s0, \s0 + vpxor T0, \s0, \s0 + vpermq $\w, \s3, \s3 +.endm + +/* + * __morus1280_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update: + morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 + vpxor MSG, STATE1, STATE1 + morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 + vpxor MSG, STATE2, STATE2 + morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 + vpxor MSG, STATE3, STATE3 + morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 + vpxor MSG, STATE4, STATE4 + morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 + ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update_zero: + morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 + morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 + morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 + morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 + morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 + ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + vpxor MSG, MSG, MSG + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG_LOW + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pshufd $MASK2, MSG_LOW, MSG_LOW + pinsrq $0, (%r8), MSG_LOW + +.Lld_partial_8: + mov %rcx, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + vpermq $MASK2, MSG, MSG + movdqu (%rsi), MSG_LOW + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0_LOW, (%r9) + vpermq $MASK2, T0, T0 + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0_LOW, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + pextrq $1, T0_LOW, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_avx2_init(void *state, const void *key, + * const void *iv); + */ +ENTRY(crypto_morus1280_avx2_init) + FRAME_BEGIN + + /* load IV: */ + vpxor STATE0, STATE0, STATE0 + movdqu (%rdx), STATE0_LOW + /* load key: */ + vmovdqu (%rsi), KEY + vmovdqa KEY, STATE1 + /* load all ones: */ + vpcmpeqd STATE2, STATE2, STATE2 + /* load all zeros: */ + vpxor STATE3, STATE3, STATE3 + /* load the constant: */ + vmovdqa .Lmorus1280_const, STATE4 + + /* update 16 times with zero: */ + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + + /* xor-in the key again after updates: */ + vpxor KEY, STATE1, STATE1 + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_init) + +/* + * void crypto_morus1280_avx2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_ad) + FRAME_BEGIN + + cmp $32, %rdx + jb .Lad_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + and $0x1F, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + vmovdqa (%rsi), MSG + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + vmovdqu (%rsi), MSG + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_ad) + +/* + * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_enc) + FRAME_BEGIN + + cmp $32, %rcx + jb .Lenc_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0x1F, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + vmovdqa (%rsi), MSG + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + vmovdqa T0, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + vmovdqu (%rsi), MSG + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + vmovdqu T0, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_enc) + +/* + * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* encrypt message: */ + call __load_partial + + vmovdqa MSG, T0 + vpxor STATE0, T0, T0 + vpermq $MASK3, STATE1, T1 + vpxor T1, T0, T0 + vpand STATE2, STATE3, T1 + vpxor T1, T0, T0 + + call __store_partial + + call __morus1280_update + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END +ENDPROC(crypto_morus1280_avx2_enc_tail) + +/* + * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec) + FRAME_BEGIN + + cmp $32, %rcx + jb .Ldec_out + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0x1F, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + vmovdqa (%rsi), MSG + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqa MSG, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + vmovdqu (%rsi), MSG + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqu MSG, (%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_dec) + +/* + * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* decrypt message: */ + call __load_partial + + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqa MSG, T0 + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0_LOW + vpbroadcastb T0_LOW, T0 + vmovdqa .Lmorus1280_counter, T1 + vpcmpgtb T1, T0, T0 + vpand T0, MSG, MSG + + call __morus1280_update + + /* store the state: */ + vmovdqu STATE0, (0 * 32)(%rdi) + vmovdqu STATE1, (1 * 32)(%rdi) + vmovdqu STATE2, (2 * 32)(%rdi) + vmovdqu STATE3, (3 * 32)(%rdi) + vmovdqu STATE4, (4 * 32)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_dec_tail) + +/* + * void crypto_morus1280_avx2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_avx2_final) + FRAME_BEGIN + + /* load the state: */ + vmovdqu (0 * 32)(%rdi), STATE0 + vmovdqu (1 * 32)(%rdi), STATE1 + vmovdqu (2 * 32)(%rdi), STATE2 + vmovdqu (3 * 32)(%rdi), STATE3 + vmovdqu (4 * 32)(%rdi), STATE4 + + /* xor state[0] into state[4]: */ + vpxor STATE0, STATE4, STATE4 + + /* prepare length block: */ + vpxor MSG, MSG, MSG + vpinsrq $0, %rdx, MSG_LOW, MSG_LOW + vpinsrq $1, %rcx, MSG_LOW, MSG_LOW + vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ + + /* update state: */ + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + + /* xor tag: */ + vmovdqu (%rsi), MSG + + vpxor STATE0, MSG, MSG + vpermq $MASK3, STATE1, T0 + vpxor T0, MSG, MSG + vpand STATE2, STATE3, T0 + vpxor T0, MSG, MSG + vmovdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c new file mode 100644 index 000000000000..f111f36d26dc --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Glue for AVX2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/internal/aead.h> +#include <crypto/morus1280_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); + +static const struct x86_cpu_id avx2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AVX2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); + +static int __init crypto_morus1280_avx2_module_init(void) +{ + if (!x86_match_cpu(avx2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus1280_avx2_algs, + ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +static void __exit crypto_morus1280_avx2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus1280_avx2_algs, + ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +module_init(crypto_morus1280_avx2_module_init); +module_exit(crypto_morus1280_avx2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S new file mode 100644 index 000000000000..1fe637c7be9d --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -0,0 +1,895 @@ +/* + * SSE2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) + +#define STATE0_LO %xmm0 +#define STATE0_HI %xmm1 +#define STATE1_LO %xmm2 +#define STATE1_HI %xmm3 +#define STATE2_LO %xmm4 +#define STATE2_HI %xmm5 +#define STATE3_LO %xmm6 +#define STATE3_HI %xmm7 +#define STATE4_LO %xmm8 +#define STATE4_HI %xmm9 +#define KEY_LO %xmm10 +#define KEY_HI %xmm11 +#define MSG_LO %xmm10 +#define MSG_HI %xmm11 +#define T0_LO %xmm12 +#define T0_HI %xmm13 +#define T1_LO %xmm14 +#define T1_HI %xmm15 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 16 +.align 16 +.Lmorus640_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter_0: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Lmorus640_counter_1: + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro rol1 hi, lo + /* + * HI_1 | HI_0 || LO_1 | LO_0 + * ==> + * HI_0 | HI_1 || LO_1 | LO_0 + * ==> + * HI_0 | LO_1 || LO_0 | HI_1 + */ + pshufd $MASK2, \hi, \hi + movdqa \hi, T0_LO + punpcklqdq \lo, T0_LO + punpckhqdq \hi, \lo + movdqa \lo, \hi + movdqa T0_LO, \lo +.endm + +.macro rol2 hi, lo + movdqa \lo, T0_LO + movdqa \hi, \lo + movdqa T0_LO, \hi +.endm + +.macro rol3 hi, lo + /* + * HI_1 | HI_0 || LO_1 | LO_0 + * ==> + * HI_0 | HI_1 || LO_1 | LO_0 + * ==> + * LO_0 | HI_1 || HI_0 | LO_1 + */ + pshufd $MASK2, \hi, \hi + movdqa \lo, T0_LO + punpckhqdq \hi, T0_LO + punpcklqdq \lo, \hi + movdqa T0_LO, \lo +.endm + +.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w + movdqa \s1_l, T0_LO + pand \s2_l, T0_LO + pxor T0_LO, \s0_l + + movdqa \s1_h, T0_LO + pand \s2_h, T0_LO + pxor T0_LO, \s0_h + + pxor \s3_l, \s0_l + pxor \s3_h, \s0_h + + movdqa \s0_l, T0_LO + psllq $\b, T0_LO + psrlq $(64 - \b), \s0_l + pxor T0_LO, \s0_l + + movdqa \s0_h, T0_LO + psllq $\b, T0_LO + psrlq $(64 - \b), \s0_h + pxor T0_LO, \s0_h + + \w \s3_h, \s3_l +.endm + +/* + * __morus1280_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update: + morus1280_round \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + 13, rol1 + pxor MSG_LO, STATE1_LO + pxor MSG_HI, STATE1_HI + morus1280_round \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + 46, rol2 + pxor MSG_LO, STATE2_LO + pxor MSG_HI, STATE2_HI + morus1280_round \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + 38, rol3 + pxor MSG_LO, STATE3_LO + pxor MSG_HI, STATE3_HI + morus1280_round \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + 7, rol2 + pxor MSG_LO, STATE4_LO + pxor MSG_HI, STATE4_HI + morus1280_round \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + 4, rol1 + ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus1280_update_zero: + morus1280_round \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + 13, rol1 + morus1280_round \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + 46, rol2 + morus1280_round \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + 38, rol3 + morus1280_round \ + STATE3_LO, STATE3_HI, \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + 7, rol2 + morus1280_round \ + STATE4_LO, STATE4_HI, \ + STATE0_LO, STATE0_HI, \ + STATE1_LO, STATE1_HI, \ + STATE2_LO, STATE2_HI, \ + STATE3_LO, STATE3_HI, \ + 4, rol1 + ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG_LO, MSG_LO + pxor MSG_HI, MSG_HI + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG_LO + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pslldq $8, MSG_LO + movq (%r8), T0_LO + pxor T0_LO, MSG_LO + +.Lld_partial_8: + mov %rcx, %r8 + and $0x10, %r8 + jz .Lld_partial_16 + + movdqa MSG_LO, MSG_HI + movdqu (%rsi), MSG_LO + +.Lld_partial_16: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + cmp $16, %r8 + jl .Lst_partial_16 + + movdqu T0_LO, (%r9) + movdqa T0_HI, T0_LO + + sub $16, %r8 + add $16, %r9 + +.Lst_partial_16: + movq T0_LO, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0_LO + movq T0_LO, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_sse2_init(void *state, const void *key, + * const void *iv); + */ +ENTRY(crypto_morus1280_sse2_init) + FRAME_BEGIN + + /* load IV: */ + pxor STATE0_HI, STATE0_HI + movdqu (%rdx), STATE0_LO + /* load key: */ + movdqu 0(%rsi), KEY_LO + movdqu 16(%rsi), KEY_HI + movdqa KEY_LO, STATE1_LO + movdqa KEY_HI, STATE1_HI + /* load all ones: */ + pcmpeqd STATE2_LO, STATE2_LO + pcmpeqd STATE2_HI, STATE2_HI + /* load all zeros: */ + pxor STATE3_LO, STATE3_LO + pxor STATE3_HI, STATE3_HI + /* load the constant: */ + movdqa .Lmorus640_const_0, STATE4_LO + movdqa .Lmorus640_const_1, STATE4_HI + + /* update 16 times with zero: */ + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + call __morus1280_update_zero + + /* xor-in the key again after updates: */ + pxor KEY_LO, STATE1_LO + pxor KEY_HI, STATE1_HI + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_init) + +/* + * void crypto_morus1280_sse2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_ad) + FRAME_BEGIN + + cmp $32, %rdx + jb .Lad_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + call __morus1280_update + sub $32, %rdx + add $32, %rsi + cmp $32, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_ad) + +/* + * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc) + FRAME_BEGIN + + cmp $32, %rcx + jb .Lenc_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + movdqa T0_LO, 0(%rdx) + movdqa T0_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + movdqu T0_LO, 0(%rdx) + movdqu T0_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_enc) + +/* + * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* encrypt message: */ + call __load_partial + + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + pxor STATE0_LO, T0_LO + pxor STATE0_HI, T0_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, T0_LO + pxor T1_HI, T0_HI + + call __store_partial + + call __morus1280_update + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END +ENDPROC(crypto_morus1280_sse2_enc_tail) + +/* + * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec) + FRAME_BEGIN + + cmp $32, %rcx + jb .Ldec_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + movdqa 0(%rsi), MSG_LO + movdqa 16(%rsi), MSG_HI + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa MSG_LO, 0(%rdx) + movdqa MSG_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqu MSG_LO, 0(%rdx) + movdqu MSG_HI, 16(%rdx) + + call __morus1280_update + sub $32, %rcx + add $32, %rsi + add $32, %rdx + cmp $32, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_dec) + +/* + * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* decrypt message: */ + call __load_partial + + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T1_LO + movdqa STATE1_HI, T1_HI + rol3 T1_HI, T1_LO + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa STATE2_LO, T1_LO + movdqa STATE2_HI, T1_HI + pand STATE3_LO, T1_LO + pand STATE3_HI, T1_HI + pxor T1_LO, MSG_LO + pxor T1_HI, MSG_HI + movdqa MSG_LO, T0_LO + movdqa MSG_HI, T0_HI + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + punpcklbw T0_LO, T0_LO + movdqa T0_LO, T0_HI + movdqa .Lmorus640_counter_0, T1_LO + movdqa .Lmorus640_counter_1, T1_HI + pcmpgtb T1_LO, T0_LO + pcmpgtb T1_HI, T0_HI + pand T0_LO, MSG_LO + pand T0_HI, MSG_HI + + call __morus1280_update + + /* store the state: */ + movdqu STATE0_LO, (0 * 16)(%rdi) + movdqu STATE0_HI, (1 * 16)(%rdi) + movdqu STATE1_LO, (2 * 16)(%rdi) + movdqu STATE1_HI, (3 * 16)(%rdi) + movdqu STATE2_LO, (4 * 16)(%rdi) + movdqu STATE2_HI, (5 * 16)(%rdi) + movdqu STATE3_LO, (6 * 16)(%rdi) + movdqu STATE3_HI, (7 * 16)(%rdi) + movdqu STATE4_LO, (8 * 16)(%rdi) + movdqu STATE4_HI, (9 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_dec_tail) + +/* + * void crypto_morus1280_sse2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_sse2_final) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0_LO + movdqu (1 * 16)(%rdi), STATE0_HI + movdqu (2 * 16)(%rdi), STATE1_LO + movdqu (3 * 16)(%rdi), STATE1_HI + movdqu (4 * 16)(%rdi), STATE2_LO + movdqu (5 * 16)(%rdi), STATE2_HI + movdqu (6 * 16)(%rdi), STATE3_LO + movdqu (7 * 16)(%rdi), STATE3_HI + movdqu (8 * 16)(%rdi), STATE4_LO + movdqu (9 * 16)(%rdi), STATE4_HI + + /* xor state[0] into state[4]: */ + pxor STATE0_LO, STATE4_LO + pxor STATE0_HI, STATE4_HI + + /* prepare length block: */ + movq %rdx, MSG_LO + movq %rcx, T0_LO + pslldq $8, T0_LO + pxor T0_LO, MSG_LO + psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ + pxor MSG_HI, MSG_HI + + /* update state: */ + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + call __morus1280_update + + /* xor tag: */ + movdqu 0(%rsi), MSG_LO + movdqu 16(%rsi), MSG_HI + + pxor STATE0_LO, MSG_LO + pxor STATE0_HI, MSG_HI + movdqa STATE1_LO, T0_LO + movdqa STATE1_HI, T0_HI + rol3 T0_HI, T0_LO + pxor T0_LO, MSG_LO + pxor T0_HI, MSG_HI + movdqa STATE2_LO, T0_LO + movdqa STATE2_HI, T0_HI + pand STATE3_LO, T0_LO + pand STATE3_HI, T0_HI + pxor T0_LO, MSG_LO + pxor T0_HI, MSG_HI + + movdqu MSG_LO, 0(%rsi) + movdqu MSG_HI, 16(%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c new file mode 100644 index 000000000000..839270aa713c --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/internal/aead.h> +#include <crypto/morus1280_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); + +static const struct x86_cpu_id sse2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus1280_sse2_module_init(void) +{ + if (!x86_match_cpu(sse2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus1280_sse2_algs, + ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +static void __exit crypto_morus1280_sse2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus1280_sse2_algs, + ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +module_init(crypto_morus1280_sse2_module_init); +module_exit(crypto_morus1280_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c new file mode 100644 index 000000000000..0dccdda1eb3a --- /dev/null +++ b/arch/x86/crypto/morus1280_glue.c @@ -0,0 +1,302 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + * Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/morus1280_glue.h> +#include <crypto/scatterwalk.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <asm/fpu/api.h> + +struct morus1280_state { + struct morus1280_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus1280_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, const void *src, void *dst, + unsigned int length); + void (*crypt_tail)(void *state, const void *src, void *dst, + unsigned int length); +}; + +static void crypto_morus1280_glue_process_ad( + struct morus1280_state *state, + const struct morus1280_glue_ops *ops, + struct scatterlist *sg_src, unsigned int assoclen) +{ + struct scatter_walk walk; + struct morus1280_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= MORUS1280_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = MORUS1280_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + + ops->ad(state, src, left); + src += left & ~(MORUS1280_BLOCK_SIZE - 1); + left &= MORUS1280_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + + pos += left; + assoclen -= size; + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); + ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); + } +} + +static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, + struct morus1280_ops ops, + struct aead_request *req) +{ + struct skcipher_walk walk; + u8 *cursor_src, *cursor_dst; + unsigned int chunksize, base; + + ops.skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + cursor_src = walk.src.virt.addr; + cursor_dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + + base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1); + cursor_src += base; + cursor_dst += base; + chunksize &= MORUS1280_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops.crypt_tail(state, cursor_src, cursor_dst, + chunksize); + + skcipher_walk_done(&walk, 0); + } +} + +int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct morus1280_ctx *ctx = crypto_aead_ctx(aead); + + if (keylen == MORUS1280_BLOCK_SIZE) { + memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); + } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { + memcpy(ctx->key.bytes, key, keylen); + memcpy(ctx->key.bytes + keylen, key, keylen); + } else { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); + +int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); + +static void crypto_morus1280_glue_crypt(struct aead_request *req, + struct morus1280_ops ops, + unsigned int cryptlen, + struct morus1280_block *tag_xor) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_state state; + + kernel_fpu_begin(); + + ctx->ops->init(&state, &ctx->key, req->iv); + crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); + crypto_morus1280_glue_process_crypt(&state, ops, req); + ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +int crypto_morus1280_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = ctx->ops->enc, + .crypt_tail = ctx->ops->enc_tail, + }; + + struct morus1280_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); + +int crypto_morus1280_glue_decrypt(struct aead_request *req) +{ + static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); + struct morus1280_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = ctx->ops->dec, + .crypt_tail = ctx->ops->dec_tail, + }; + + struct morus1280_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + + return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); + +void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, + const struct morus1280_glue_ops *ops) +{ + struct morus1280_ctx *ctx = crypto_aead_ctx(aead); + ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); + +int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); + +int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); + +int cryptd_morus1280_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); + +int cryptd_morus1280_glue_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); + +int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + const char *name = crypto_aead_alg(aead)->base.cra_driver_name; + char internal_name[CRYPTO_MAX_ALG_NAME]; + + if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) + >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); + +void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S new file mode 100644 index 000000000000..71c72a0a0862 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -0,0 +1,614 @@ +/* + * SSE2 implementation of MORUS-640 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ + (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0 %xmm0 +#define STATE1 %xmm1 +#define STATE2 %xmm2 +#define STATE3 %xmm3 +#define STATE4 %xmm4 +#define KEY %xmm5 +#define MSG %xmm5 +#define T0 %xmm6 +#define T1 %xmm7 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 32 +.align 16 +.Lmorus640_const_0: + .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d + .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +.macro morus640_round s0, s1, s2, s3, s4, b, w + movdqa \s1, T0 + pand \s2, T0 + pxor T0, \s0 + pxor \s3, \s0 + movdqa \s0, T0 + pslld $\b, T0 + psrld $(32 - \b), \s0 + pxor T0, \s0 + pshufd $\w, \s3, \s3 +.endm + +/* + * __morus640_update: internal ABI + * input: + * STATE[0-4] - input state + * MSG - message block + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus640_update: + morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 + pxor MSG, STATE1 + morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 + pxor MSG, STATE2 + morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 + pxor MSG, STATE3 + morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 + pxor MSG, STATE4 + morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 + ret +ENDPROC(__morus640_update) + + +/* + * __morus640_update_zero: internal ABI + * input: + * STATE[0-4] - input state + * output: + * STATE[0-4] - output state + * changed: + * T0 + */ +__morus640_update_zero: + morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 + morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 + morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 + morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 + morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 + ret +ENDPROC(__morus640_update_zero) + +/* + * __load_partial: internal ABI + * input: + * %rsi - src + * %rcx - bytes + * output: + * MSG - message block + * changed: + * T0 + * %r8 + * %r9 + */ +__load_partial: + xor %r9, %r9 + pxor MSG, MSG + + mov %rcx, %r8 + and $0x1, %r8 + jz .Lld_partial_1 + + mov %rcx, %r8 + and $0x1E, %r8 + add %rsi, %r8 + mov (%r8), %r9b + +.Lld_partial_1: + mov %rcx, %r8 + and $0x2, %r8 + jz .Lld_partial_2 + + mov %rcx, %r8 + and $0x1C, %r8 + add %rsi, %r8 + shl $16, %r9 + mov (%r8), %r9w + +.Lld_partial_2: + mov %rcx, %r8 + and $0x4, %r8 + jz .Lld_partial_4 + + mov %rcx, %r8 + and $0x18, %r8 + add %rsi, %r8 + shl $32, %r9 + mov (%r8), %r8d + xor %r8, %r9 + +.Lld_partial_4: + movq %r9, MSG + + mov %rcx, %r8 + and $0x8, %r8 + jz .Lld_partial_8 + + mov %rcx, %r8 + and $0x10, %r8 + add %rsi, %r8 + pslldq $8, MSG + movq (%r8), T0 + pxor T0, MSG + +.Lld_partial_8: + ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + * %rdx - dst + * %rcx - bytes + * output: + * T0 - message block + * changed: + * %r8 + * %r9 + * %r10 + */ +__store_partial: + mov %rcx, %r8 + mov %rdx, %r9 + + movq T0, %r10 + + cmp $8, %r8 + jl .Lst_partial_8 + + mov %r10, (%r9) + psrldq $8, T0 + movq T0, %r10 + + sub $8, %r8 + add $8, %r9 + +.Lst_partial_8: + cmp $4, %r8 + jl .Lst_partial_4 + + mov %r10d, (%r9) + shr $32, %r10 + + sub $4, %r8 + add $4, %r9 + +.Lst_partial_4: + cmp $2, %r8 + jl .Lst_partial_2 + + mov %r10w, (%r9) + shr $16, %r10 + + sub $2, %r8 + add $2, %r9 + +.Lst_partial_2: + cmp $1, %r8 + jl .Lst_partial_1 + + mov %r10b, (%r9) + +.Lst_partial_1: + ret +ENDPROC(__store_partial) + +/* + * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_morus640_sse2_init) + FRAME_BEGIN + + /* load IV: */ + movdqu (%rdx), STATE0 + /* load key: */ + movdqu (%rsi), KEY + movdqa KEY, STATE1 + /* load all ones: */ + pcmpeqd STATE2, STATE2 + /* load the constants: */ + movdqa .Lmorus640_const_0, STATE3 + movdqa .Lmorus640_const_1, STATE4 + + /* update 16 times with zero: */ + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + call __morus640_update_zero + + /* xor-in the key again after updates: */ + pxor KEY, STATE1 + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_init) + +/* + * void crypto_morus640_sse2_ad(void *state, const void *data, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_ad) + FRAME_BEGIN + + cmp $16, %rdx + jb .Lad_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + and $0xF, %r8 + jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: + movdqa (%rsi), MSG + call __morus640_update + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lad_a_loop + + jmp .Lad_cont +.align 4 +.Lad_u_loop: + movdqu (%rsi), MSG + call __morus640_update + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lad_u_loop + +.Lad_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Lad_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_ad) + +/* + * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc) + FRAME_BEGIN + + cmp $16, %rcx + jb .Lenc_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: + movdqa (%rsi), MSG + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + movdqa T0, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Lenc_a_loop + + jmp .Lenc_cont +.align 4 +.Lenc_u_loop: + movdqu (%rsi), MSG + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + movdqu T0, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Lenc_u_loop + +.Lenc_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Lenc_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_enc) + +/* + * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* encrypt message: */ + call __load_partial + + movdqa MSG, T0 + pxor STATE0, T0 + pshufd $MASK3, STATE1, T1 + pxor T1, T0 + movdqa STATE2, T1 + pand STATE3, T1 + pxor T1, T0 + + call __store_partial + + call __morus640_update + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END +ENDPROC(crypto_morus640_sse2_enc_tail) + +/* + * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec) + FRAME_BEGIN + + cmp $16, %rcx + jb .Ldec_out + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + mov %rsi, %r8 + or %rdx, %r8 + and $0xF, %r8 + jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: + movdqa (%rsi), MSG + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqa MSG, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Ldec_a_loop + + jmp .Ldec_cont +.align 4 +.Ldec_u_loop: + movdqu (%rsi), MSG + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqu MSG, (%rdx) + + call __morus640_update + sub $16, %rcx + add $16, %rsi + add $16, %rdx + cmp $16, %rcx + jge .Ldec_u_loop + +.Ldec_cont: + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + +.Ldec_out: + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_dec) + +/* + * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, + * unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec_tail) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* decrypt message: */ + call __load_partial + + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + movdqa MSG, T0 + + call __store_partial + + /* mask with byte count: */ + movq %rcx, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + punpcklbw T0, T0 + movdqa .Lmorus640_counter, T1 + pcmpgtb T1, T0 + pand T0, MSG + + call __morus640_update + + /* store the state: */ + movdqu STATE0, (0 * 16)(%rdi) + movdqu STATE1, (1 * 16)(%rdi) + movdqu STATE2, (2 * 16)(%rdi) + movdqu STATE3, (3 * 16)(%rdi) + movdqu STATE4, (4 * 16)(%rdi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_dec_tail) + +/* + * void crypto_morus640_sse2_final(void *state, void *tag_xor, + * u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus640_sse2_final) + FRAME_BEGIN + + /* load the state: */ + movdqu (0 * 16)(%rdi), STATE0 + movdqu (1 * 16)(%rdi), STATE1 + movdqu (2 * 16)(%rdi), STATE2 + movdqu (3 * 16)(%rdi), STATE3 + movdqu (4 * 16)(%rdi), STATE4 + + /* xor state[0] into state[4]: */ + pxor STATE0, STATE4 + + /* prepare length block: */ + movq %rdx, MSG + movq %rcx, T0 + pslldq $8, T0 + pxor T0, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + /* update state: */ + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + call __morus640_update + + /* xor tag: */ + movdqu (%rsi), MSG + + pxor STATE0, MSG + pshufd $MASK3, STATE1, T0 + pxor T0, MSG + movdqa STATE2, T0 + pand STATE3, T0 + pxor T0, MSG + + movdqu MSG, (%rsi) + + FRAME_END + ret +ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c new file mode 100644 index 000000000000..26b47e2db8d2 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + * Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/internal/aead.h> +#include <crypto/morus640_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, + const void *iv); +asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, + unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, + void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, + void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, + u64 assoclen, u64 cryptlen); + +MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); + +static const struct x86_cpu_id sse2_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XMM2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus640_sse2_module_init(void) +{ + if (!x86_match_cpu(sse2_cpu_id)) + return -ENODEV; + + return crypto_register_aeads(crypto_morus640_sse2_algs, + ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +static void __exit crypto_morus640_sse2_module_exit(void) +{ + crypto_unregister_aeads(crypto_morus640_sse2_algs, + ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +module_init(crypto_morus640_sse2_module_init); +module_exit(crypto_morus640_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus640"); +MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c new file mode 100644 index 000000000000..7b58fe4d9bd1 --- /dev/null +++ b/arch/x86/crypto/morus640_glue.c @@ -0,0 +1,298 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + * Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/morus640_glue.h> +#include <crypto/scatterwalk.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <asm/fpu/api.h> + +struct morus640_state { + struct morus640_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus640_ops { + int (*skcipher_walk_init)(struct skcipher_walk *walk, + struct aead_request *req, bool atomic); + + void (*crypt_blocks)(void *state, const void *src, void *dst, + unsigned int length); + void (*crypt_tail)(void *state, const void *src, void *dst, + unsigned int length); +}; + +static void crypto_morus640_glue_process_ad( + struct morus640_state *state, + const struct morus640_glue_ops *ops, + struct scatterlist *sg_src, unsigned int assoclen) +{ + struct scatter_walk walk; + struct morus640_block buf; + unsigned int pos = 0; + + scatterwalk_start(&walk, sg_src); + while (assoclen != 0) { + unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int left = size; + void *mapped = scatterwalk_map(&walk); + const u8 *src = (const u8 *)mapped; + + if (pos + size >= MORUS640_BLOCK_SIZE) { + if (pos > 0) { + unsigned int fill = MORUS640_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); + ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + + ops->ad(state, src, left); + src += left & ~(MORUS640_BLOCK_SIZE - 1); + left &= MORUS640_BLOCK_SIZE - 1; + } + + memcpy(buf.bytes + pos, src, left); + + pos += left; + assoclen -= size; + scatterwalk_unmap(mapped); + scatterwalk_advance(&walk, size); + scatterwalk_done(&walk, 0, assoclen); + } + + if (pos > 0) { + memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); + ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); + } +} + +static void crypto_morus640_glue_process_crypt(struct morus640_state *state, + struct morus640_ops ops, + struct aead_request *req) +{ + struct skcipher_walk walk; + u8 *cursor_src, *cursor_dst; + unsigned int chunksize, base; + + ops.skcipher_walk_init(&walk, req, false); + + while (walk.nbytes) { + cursor_src = walk.src.virt.addr; + cursor_dst = walk.dst.virt.addr; + chunksize = walk.nbytes; + + ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + + base = chunksize & ~(MORUS640_BLOCK_SIZE - 1); + cursor_src += base; + cursor_dst += base; + chunksize &= MORUS640_BLOCK_SIZE - 1; + + if (chunksize > 0) + ops.crypt_tail(state, cursor_src, cursor_dst, + chunksize); + + skcipher_walk_done(&walk, 0); + } +} + +int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct morus640_ctx *ctx = crypto_aead_ctx(aead); + + if (keylen != MORUS640_BLOCK_SIZE) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); + +int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); + +static void crypto_morus640_glue_crypt(struct aead_request *req, + struct morus640_ops ops, + unsigned int cryptlen, + struct morus640_block *tag_xor) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_state state; + + kernel_fpu_begin(); + + ctx->ops->init(&state, &ctx->key, req->iv); + crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); + crypto_morus640_glue_process_crypt(&state, ops, req); + ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); +} + +int crypto_morus640_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_encrypt, + .crypt_blocks = ctx->ops->enc, + .crypt_tail = ctx->ops->enc_tail, + }; + + struct morus640_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + + crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); + return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); + +int crypto_morus640_glue_decrypt(struct aead_request *req) +{ + static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; + + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct morus640_ctx *ctx = crypto_aead_ctx(tfm); + struct morus640_ops OPS = { + .skcipher_walk_init = skcipher_walk_aead_decrypt, + .crypt_blocks = ctx->ops->dec, + .crypt_tail = ctx->ops->dec_tail, + }; + + struct morus640_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen - authsize; + + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + + crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + + return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); + +void crypto_morus640_glue_init_ops(struct crypto_aead *aead, + const struct morus640_glue_ops *ops) +{ + struct morus640_ctx *ctx = crypto_aead_ctx(aead); + ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); + +int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); + +int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); + +int cryptd_morus640_glue_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); + +int cryptd_morus640_glue_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + struct cryptd_aead *cryptd_tfm = *ctx; + + aead = &cryptd_tfm->base; + if (irq_fpu_usable() && (!in_atomic() || + !cryptd_aead_queued(cryptd_tfm))) + aead = cryptd_aead_child(cryptd_tfm); + + aead_request_set_tfm(req, aead); + + return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); + +int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + const char *name = crypto_aead_alg(aead)->base.cra_driver_name; + char internal_name[CRYPTO_MAX_ALG_NAME]; + + if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) + >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); + +void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); +MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S deleted file mode 100644 index 6014b7b9e52a..000000000000 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ /dev/null @@ -1,938 +0,0 @@ -# Derived from: -# salsa20_pm.s version 20051229 -# D. J. Bernstein -# Public domain. - -#include <linux/linkage.h> - -.text - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) - mov %esp,%eax - and $31,%eax - add $256,%eax - sub %eax,%esp - # eax_stack = eax - movl %eax,80(%esp) - # ebx_stack = ebx - movl %ebx,84(%esp) - # esi_stack = esi - movl %esi,88(%esp) - # edi_stack = edi - movl %edi,92(%esp) - # ebp_stack = ebp - movl %ebp,96(%esp) - # x = arg1 - movl 4(%esp,%eax),%edx - # m = arg2 - movl 8(%esp,%eax),%esi - # out = arg3 - movl 12(%esp,%eax),%edi - # bytes = arg4 - movl 16(%esp,%eax),%ebx - # bytes -= 0 - sub $0,%ebx - # goto done if unsigned<= - jbe ._done -._start: - # in0 = *(uint32 *) (x + 0) - movl 0(%edx),%eax - # in1 = *(uint32 *) (x + 4) - movl 4(%edx),%ecx - # in2 = *(uint32 *) (x + 8) - movl 8(%edx),%ebp - # j0 = in0 - movl %eax,164(%esp) - # in3 = *(uint32 *) (x + 12) - movl 12(%edx),%eax - # j1 = in1 - movl %ecx,168(%esp) - # in4 = *(uint32 *) (x + 16) - movl 16(%edx),%ecx - # j2 = in2 - movl %ebp,172(%esp) - # in5 = *(uint32 *) (x + 20) - movl 20(%edx),%ebp - # j3 = in3 - movl %eax,176(%esp) - # in6 = *(uint32 *) (x + 24) - movl 24(%edx),%eax - # j4 = in4 - movl %ecx,180(%esp) - # in7 = *(uint32 *) (x + 28) - movl 28(%edx),%ecx - # j5 = in5 - movl %ebp,184(%esp) - # in8 = *(uint32 *) (x + 32) - movl 32(%edx),%ebp - # j6 = in6 - movl %eax,188(%esp) - # in9 = *(uint32 *) (x + 36) - movl 36(%edx),%eax - # j7 = in7 - movl %ecx,192(%esp) - # in10 = *(uint32 *) (x + 40) - movl 40(%edx),%ecx - # j8 = in8 - movl %ebp,196(%esp) - # in11 = *(uint32 *) (x + 44) - movl 44(%edx),%ebp - # j9 = in9 - movl %eax,200(%esp) - # in12 = *(uint32 *) (x + 48) - movl 48(%edx),%eax - # j10 = in10 - movl %ecx,204(%esp) - # in13 = *(uint32 *) (x + 52) - movl 52(%edx),%ecx - # j11 = in11 - movl %ebp,208(%esp) - # in14 = *(uint32 *) (x + 56) - movl 56(%edx),%ebp - # j12 = in12 - movl %eax,212(%esp) - # in15 = *(uint32 *) (x + 60) - movl 60(%edx),%eax - # j13 = in13 - movl %ecx,216(%esp) - # j14 = in14 - movl %ebp,220(%esp) - # j15 = in15 - movl %eax,224(%esp) - # x_backup = x - movl %edx,64(%esp) -._bytesatleast1: - # bytes - 64 - cmp $64,%ebx - # goto nocopy if unsigned>= - jae ._nocopy - # ctarget = out - movl %edi,228(%esp) - # out = &tmp - leal 0(%esp),%edi - # i = bytes - mov %ebx,%ecx - # while (i) { *out++ = *m++; --i } - rep movsb - # out = &tmp - leal 0(%esp),%edi - # m = &tmp - leal 0(%esp),%esi -._nocopy: - # out_backup = out - movl %edi,72(%esp) - # m_backup = m - movl %esi,68(%esp) - # bytes_backup = bytes - movl %ebx,76(%esp) - # in0 = j0 - movl 164(%esp),%eax - # in1 = j1 - movl 168(%esp),%ecx - # in2 = j2 - movl 172(%esp),%edx - # in3 = j3 - movl 176(%esp),%ebx - # x0 = in0 - movl %eax,100(%esp) - # x1 = in1 - movl %ecx,104(%esp) - # x2 = in2 - movl %edx,108(%esp) - # x3 = in3 - movl %ebx,112(%esp) - # in4 = j4 - movl 180(%esp),%eax - # in5 = j5 - movl 184(%esp),%ecx - # in6 = j6 - movl 188(%esp),%edx - # in7 = j7 - movl 192(%esp),%ebx - # x4 = in4 - movl %eax,116(%esp) - # x5 = in5 - movl %ecx,120(%esp) - # x6 = in6 - movl %edx,124(%esp) - # x7 = in7 - movl %ebx,128(%esp) - # in8 = j8 - movl 196(%esp),%eax - # in9 = j9 - movl 200(%esp),%ecx - # in10 = j10 - movl 204(%esp),%edx - # in11 = j11 - movl 208(%esp),%ebx - # x8 = in8 - movl %eax,132(%esp) - # x9 = in9 - movl %ecx,136(%esp) - # x10 = in10 - movl %edx,140(%esp) - # x11 = in11 - movl %ebx,144(%esp) - # in12 = j12 - movl 212(%esp),%eax - # in13 = j13 - movl 216(%esp),%ecx - # in14 = j14 - movl 220(%esp),%edx - # in15 = j15 - movl 224(%esp),%ebx - # x12 = in12 - movl %eax,148(%esp) - # x13 = in13 - movl %ecx,152(%esp) - # x14 = in14 - movl %edx,156(%esp) - # x15 = in15 - movl %ebx,160(%esp) - # i = 20 - mov $20,%ebp - # p = x0 - movl 100(%esp),%eax - # s = x5 - movl 120(%esp),%ecx - # t = x10 - movl 140(%esp),%edx - # w = x15 - movl 160(%esp),%ebx -._mainloop: - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x12 - addl 148(%esp),%eax - # x5 = s - movl %ecx,120(%esp) - # t += x6 - addl 124(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x1 - movl 104(%esp),%esi - # r += s - add %ecx,%esi - # v = x11 - movl 144(%esp),%edi - # v += w - add %ebx,%edi - # p <<<= 7 - rol $7,%eax - # p ^= x4 - xorl 116(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x14 - xorl 156(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x9 - xorl 136(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x3 - xorl 112(%esp),%edi - # x4 = p - movl %eax,116(%esp) - # x14 = t - movl %edx,156(%esp) - # p += x0 - addl 100(%esp),%eax - # x9 = r - movl %esi,136(%esp) - # t += x10 - addl 140(%esp),%edx - # x3 = v - movl %edi,112(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x8 - xorl 132(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x2 - xorl 108(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x13 - xorl 152(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x7 - xorl 128(%esp),%ebx - # x8 = p - movl %eax,132(%esp) - # x2 = t - movl %edx,108(%esp) - # p += x4 - addl 116(%esp),%eax - # x13 = s - movl %ecx,152(%esp) - # t += x14 - addl 156(%esp),%edx - # x7 = w - movl %ebx,128(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x12 - xorl 148(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x6 - xorl 124(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x1 - xorl 104(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x11 - xorl 144(%esp),%edi - # x12 = p - movl %eax,148(%esp) - # x6 = t - movl %edx,124(%esp) - # p += x8 - addl 132(%esp),%eax - # x1 = r - movl %esi,104(%esp) - # t += x2 - addl 108(%esp),%edx - # x11 = v - movl %edi,144(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x3 - addl 112(%esp),%eax - # p <<<= 7 - rol $7,%eax - # x5 = s - movl %ecx,120(%esp) - # t += x9 - addl 136(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x4 - movl 116(%esp),%esi - # r += s - add %ecx,%esi - # v = x14 - movl 156(%esp),%edi - # v += w - add %ebx,%edi - # p ^= x1 - xorl 104(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x11 - xorl 144(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x6 - xorl 124(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x12 - xorl 148(%esp),%edi - # x1 = p - movl %eax,104(%esp) - # x11 = t - movl %edx,144(%esp) - # p += x0 - addl 100(%esp),%eax - # x6 = r - movl %esi,124(%esp) - # t += x10 - addl 140(%esp),%edx - # x12 = v - movl %edi,148(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x2 - xorl 108(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x8 - xorl 132(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x7 - xorl 128(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x13 - xorl 152(%esp),%ebx - # x2 = p - movl %eax,108(%esp) - # x8 = t - movl %edx,132(%esp) - # p += x1 - addl 104(%esp),%eax - # x7 = s - movl %ecx,128(%esp) - # t += x11 - addl 144(%esp),%edx - # x13 = w - movl %ebx,152(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x3 - xorl 112(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x9 - xorl 136(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x4 - xorl 116(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x14 - xorl 156(%esp),%edi - # x3 = p - movl %eax,112(%esp) - # x9 = t - movl %edx,136(%esp) - # p += x2 - addl 108(%esp),%eax - # x4 = r - movl %esi,116(%esp) - # t += x8 - addl 132(%esp),%edx - # x14 = v - movl %edi,156(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x12 - addl 148(%esp),%eax - # x5 = s - movl %ecx,120(%esp) - # t += x6 - addl 124(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x1 - movl 104(%esp),%esi - # r += s - add %ecx,%esi - # v = x11 - movl 144(%esp),%edi - # v += w - add %ebx,%edi - # p <<<= 7 - rol $7,%eax - # p ^= x4 - xorl 116(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x14 - xorl 156(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x9 - xorl 136(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x3 - xorl 112(%esp),%edi - # x4 = p - movl %eax,116(%esp) - # x14 = t - movl %edx,156(%esp) - # p += x0 - addl 100(%esp),%eax - # x9 = r - movl %esi,136(%esp) - # t += x10 - addl 140(%esp),%edx - # x3 = v - movl %edi,112(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x8 - xorl 132(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x2 - xorl 108(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x13 - xorl 152(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x7 - xorl 128(%esp),%ebx - # x8 = p - movl %eax,132(%esp) - # x2 = t - movl %edx,108(%esp) - # p += x4 - addl 116(%esp),%eax - # x13 = s - movl %ecx,152(%esp) - # t += x14 - addl 156(%esp),%edx - # x7 = w - movl %ebx,128(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x12 - xorl 148(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x6 - xorl 124(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x1 - xorl 104(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x11 - xorl 144(%esp),%edi - # x12 = p - movl %eax,148(%esp) - # x6 = t - movl %edx,124(%esp) - # p += x8 - addl 132(%esp),%eax - # x1 = r - movl %esi,104(%esp) - # t += x2 - addl 108(%esp),%edx - # x11 = v - movl %edi,144(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # x0 = p - movl %eax,100(%esp) - # x10 = t - movl %edx,140(%esp) - # p += x3 - addl 112(%esp),%eax - # p <<<= 7 - rol $7,%eax - # x5 = s - movl %ecx,120(%esp) - # t += x9 - addl 136(%esp),%edx - # x15 = w - movl %ebx,160(%esp) - # r = x4 - movl 116(%esp),%esi - # r += s - add %ecx,%esi - # v = x14 - movl 156(%esp),%edi - # v += w - add %ebx,%edi - # p ^= x1 - xorl 104(%esp),%eax - # t <<<= 7 - rol $7,%edx - # t ^= x11 - xorl 144(%esp),%edx - # r <<<= 7 - rol $7,%esi - # r ^= x6 - xorl 124(%esp),%esi - # v <<<= 7 - rol $7,%edi - # v ^= x12 - xorl 148(%esp),%edi - # x1 = p - movl %eax,104(%esp) - # x11 = t - movl %edx,144(%esp) - # p += x0 - addl 100(%esp),%eax - # x6 = r - movl %esi,124(%esp) - # t += x10 - addl 140(%esp),%edx - # x12 = v - movl %edi,148(%esp) - # p <<<= 9 - rol $9,%eax - # p ^= x2 - xorl 108(%esp),%eax - # t <<<= 9 - rol $9,%edx - # t ^= x8 - xorl 132(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 9 - rol $9,%ecx - # s ^= x7 - xorl 128(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 9 - rol $9,%ebx - # w ^= x13 - xorl 152(%esp),%ebx - # x2 = p - movl %eax,108(%esp) - # x8 = t - movl %edx,132(%esp) - # p += x1 - addl 104(%esp),%eax - # x7 = s - movl %ecx,128(%esp) - # t += x11 - addl 144(%esp),%edx - # x13 = w - movl %ebx,152(%esp) - # p <<<= 13 - rol $13,%eax - # p ^= x3 - xorl 112(%esp),%eax - # t <<<= 13 - rol $13,%edx - # t ^= x9 - xorl 136(%esp),%edx - # r += s - add %ecx,%esi - # r <<<= 13 - rol $13,%esi - # r ^= x4 - xorl 116(%esp),%esi - # v += w - add %ebx,%edi - # v <<<= 13 - rol $13,%edi - # v ^= x14 - xorl 156(%esp),%edi - # x3 = p - movl %eax,112(%esp) - # x9 = t - movl %edx,136(%esp) - # p += x2 - addl 108(%esp),%eax - # x4 = r - movl %esi,116(%esp) - # t += x8 - addl 132(%esp),%edx - # x14 = v - movl %edi,156(%esp) - # p <<<= 18 - rol $18,%eax - # p ^= x0 - xorl 100(%esp),%eax - # t <<<= 18 - rol $18,%edx - # t ^= x10 - xorl 140(%esp),%edx - # s += r - add %esi,%ecx - # s <<<= 18 - rol $18,%ecx - # s ^= x5 - xorl 120(%esp),%ecx - # w += v - add %edi,%ebx - # w <<<= 18 - rol $18,%ebx - # w ^= x15 - xorl 160(%esp),%ebx - # i -= 4 - sub $4,%ebp - # goto mainloop if unsigned > - ja ._mainloop - # x0 = p - movl %eax,100(%esp) - # x5 = s - movl %ecx,120(%esp) - # x10 = t - movl %edx,140(%esp) - # x15 = w - movl %ebx,160(%esp) - # out = out_backup - movl 72(%esp),%edi - # m = m_backup - movl 68(%esp),%esi - # in0 = x0 - movl 100(%esp),%eax - # in1 = x1 - movl 104(%esp),%ecx - # in0 += j0 - addl 164(%esp),%eax - # in1 += j1 - addl 168(%esp),%ecx - # in0 ^= *(uint32 *) (m + 0) - xorl 0(%esi),%eax - # in1 ^= *(uint32 *) (m + 4) - xorl 4(%esi),%ecx - # *(uint32 *) (out + 0) = in0 - movl %eax,0(%edi) - # *(uint32 *) (out + 4) = in1 - movl %ecx,4(%edi) - # in2 = x2 - movl 108(%esp),%eax - # in3 = x3 - movl 112(%esp),%ecx - # in2 += j2 - addl 172(%esp),%eax - # in3 += j3 - addl 176(%esp),%ecx - # in2 ^= *(uint32 *) (m + 8) - xorl 8(%esi),%eax - # in3 ^= *(uint32 *) (m + 12) - xorl 12(%esi),%ecx - # *(uint32 *) (out + 8) = in2 - movl %eax,8(%edi) - # *(uint32 *) (out + 12) = in3 - movl %ecx,12(%edi) - # in4 = x4 - movl 116(%esp),%eax - # in5 = x5 - movl 120(%esp),%ecx - # in4 += j4 - addl 180(%esp),%eax - # in5 += j5 - addl 184(%esp),%ecx - # in4 ^= *(uint32 *) (m + 16) - xorl 16(%esi),%eax - # in5 ^= *(uint32 *) (m + 20) - xorl 20(%esi),%ecx - # *(uint32 *) (out + 16) = in4 - movl %eax,16(%edi) - # *(uint32 *) (out + 20) = in5 - movl %ecx,20(%edi) - # in6 = x6 - movl 124(%esp),%eax - # in7 = x7 - movl 128(%esp),%ecx - # in6 += j6 - addl 188(%esp),%eax - # in7 += j7 - addl 192(%esp),%ecx - # in6 ^= *(uint32 *) (m + 24) - xorl 24(%esi),%eax - # in7 ^= *(uint32 *) (m + 28) - xorl 28(%esi),%ecx - # *(uint32 *) (out + 24) = in6 - movl %eax,24(%edi) - # *(uint32 *) (out + 28) = in7 - movl %ecx,28(%edi) - # in8 = x8 - movl 132(%esp),%eax - # in9 = x9 - movl 136(%esp),%ecx - # in8 += j8 - addl 196(%esp),%eax - # in9 += j9 - addl 200(%esp),%ecx - # in8 ^= *(uint32 *) (m + 32) - xorl 32(%esi),%eax - # in9 ^= *(uint32 *) (m + 36) - xorl 36(%esi),%ecx - # *(uint32 *) (out + 32) = in8 - movl %eax,32(%edi) - # *(uint32 *) (out + 36) = in9 - movl %ecx,36(%edi) - # in10 = x10 - movl 140(%esp),%eax - # in11 = x11 - movl 144(%esp),%ecx - # in10 += j10 - addl 204(%esp),%eax - # in11 += j11 - addl 208(%esp),%ecx - # in10 ^= *(uint32 *) (m + 40) - xorl 40(%esi),%eax - # in11 ^= *(uint32 *) (m + 44) - xorl 44(%esi),%ecx - # *(uint32 *) (out + 40) = in10 - movl %eax,40(%edi) - # *(uint32 *) (out + 44) = in11 - movl %ecx,44(%edi) - # in12 = x12 - movl 148(%esp),%eax - # in13 = x13 - movl 152(%esp),%ecx - # in12 += j12 - addl 212(%esp),%eax - # in13 += j13 - addl 216(%esp),%ecx - # in12 ^= *(uint32 *) (m + 48) - xorl 48(%esi),%eax - # in13 ^= *(uint32 *) (m + 52) - xorl 52(%esi),%ecx - # *(uint32 *) (out + 48) = in12 - movl %eax,48(%edi) - # *(uint32 *) (out + 52) = in13 - movl %ecx,52(%edi) - # in14 = x14 - movl 156(%esp),%eax - # in15 = x15 - movl 160(%esp),%ecx - # in14 += j14 - addl 220(%esp),%eax - # in15 += j15 - addl 224(%esp),%ecx - # in14 ^= *(uint32 *) (m + 56) - xorl 56(%esi),%eax - # in15 ^= *(uint32 *) (m + 60) - xorl 60(%esi),%ecx - # *(uint32 *) (out + 56) = in14 - movl %eax,56(%edi) - # *(uint32 *) (out + 60) = in15 - movl %ecx,60(%edi) - # bytes = bytes_backup - movl 76(%esp),%ebx - # in8 = j8 - movl 196(%esp),%eax - # in9 = j9 - movl 200(%esp),%ecx - # in8 += 1 - add $1,%eax - # in9 += 0 + carry - adc $0,%ecx - # j8 = in8 - movl %eax,196(%esp) - # j9 = in9 - movl %ecx,200(%esp) - # bytes - 64 - cmp $64,%ebx - # goto bytesatleast65 if unsigned> - ja ._bytesatleast65 - # goto bytesatleast64 if unsigned>= - jae ._bytesatleast64 - # m = out - mov %edi,%esi - # out = ctarget - movl 228(%esp),%edi - # i = bytes - mov %ebx,%ecx - # while (i) { *out++ = *m++; --i } - rep movsb -._bytesatleast64: - # x = x_backup - movl 64(%esp),%eax - # in8 = j8 - movl 196(%esp),%ecx - # in9 = j9 - movl 200(%esp),%edx - # *(uint32 *) (x + 32) = in8 - movl %ecx,32(%eax) - # *(uint32 *) (x + 36) = in9 - movl %edx,36(%eax) -._done: - # eax = eax_stack - movl 80(%esp),%eax - # ebx = ebx_stack - movl 84(%esp),%ebx - # esi = esi_stack - movl 88(%esp),%esi - # edi = edi_stack - movl 92(%esp),%edi - # ebp = ebp_stack - movl 96(%esp),%ebp - # leave - add %eax,%esp - ret -._bytesatleast65: - # bytes -= 64 - sub $64,%ebx - # out += 64 - add $64,%edi - # m += 64 - add $64,%esi - # goto bytesatleast1 - jmp ._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S deleted file mode 100644 index 03a4918f41ee..000000000000 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ /dev/null @@ -1,805 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/linkage.h> - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) - mov %rsp,%r11 - and $31,%r11 - add $256,%r11 - sub %r11,%rsp - # x = arg1 - mov %rdi,%r8 - # m = arg2 - mov %rsi,%rsi - # out = arg3 - mov %rdx,%rdi - # bytes = arg4 - mov %rcx,%rdx - # unsigned>? bytes - 0 - cmp $0,%rdx - # comment:fp stack unchanged by jump - # goto done if !unsigned> - jbe ._done - # comment:fp stack unchanged by fallthrough -# start: -._start: - # r11_stack = r11 - movq %r11,0(%rsp) - # r12_stack = r12 - movq %r12,8(%rsp) - # r13_stack = r13 - movq %r13,16(%rsp) - # r14_stack = r14 - movq %r14,24(%rsp) - # r15_stack = r15 - movq %r15,32(%rsp) - # rbx_stack = rbx - movq %rbx,40(%rsp) - # rbp_stack = rbp - movq %rbp,48(%rsp) - # in0 = *(uint64 *) (x + 0) - movq 0(%r8),%rcx - # in2 = *(uint64 *) (x + 8) - movq 8(%r8),%r9 - # in4 = *(uint64 *) (x + 16) - movq 16(%r8),%rax - # in6 = *(uint64 *) (x + 24) - movq 24(%r8),%r10 - # in8 = *(uint64 *) (x + 32) - movq 32(%r8),%r11 - # in10 = *(uint64 *) (x + 40) - movq 40(%r8),%r12 - # in12 = *(uint64 *) (x + 48) - movq 48(%r8),%r13 - # in14 = *(uint64 *) (x + 56) - movq 56(%r8),%r14 - # j0 = in0 - movq %rcx,56(%rsp) - # j2 = in2 - movq %r9,64(%rsp) - # j4 = in4 - movq %rax,72(%rsp) - # j6 = in6 - movq %r10,80(%rsp) - # j8 = in8 - movq %r11,88(%rsp) - # j10 = in10 - movq %r12,96(%rsp) - # j12 = in12 - movq %r13,104(%rsp) - # j14 = in14 - movq %r14,112(%rsp) - # x_backup = x - movq %r8,120(%rsp) -# bytesatleast1: -._bytesatleast1: - # unsigned<? bytes - 64 - cmp $64,%rdx - # comment:fp stack unchanged by jump - # goto nocopy if !unsigned< - jae ._nocopy - # ctarget = out - movq %rdi,128(%rsp) - # out = &tmp - leaq 192(%rsp),%rdi - # i = bytes - mov %rdx,%rcx - # while (i) { *out++ = *m++; --i } - rep movsb - # out = &tmp - leaq 192(%rsp),%rdi - # m = &tmp - leaq 192(%rsp),%rsi - # comment:fp stack unchanged by fallthrough -# nocopy: -._nocopy: - # out_backup = out - movq %rdi,136(%rsp) - # m_backup = m - movq %rsi,144(%rsp) - # bytes_backup = bytes - movq %rdx,152(%rsp) - # x1 = j0 - movq 56(%rsp),%rdi - # x0 = x1 - mov %rdi,%rdx - # (uint64) x1 >>= 32 - shr $32,%rdi - # x3 = j2 - movq 64(%rsp),%rsi - # x2 = x3 - mov %rsi,%rcx - # (uint64) x3 >>= 32 - shr $32,%rsi - # x5 = j4 - movq 72(%rsp),%r8 - # x4 = x5 - mov %r8,%r9 - # (uint64) x5 >>= 32 - shr $32,%r8 - # x5_stack = x5 - movq %r8,160(%rsp) - # x7 = j6 - movq 80(%rsp),%r8 - # x6 = x7 - mov %r8,%rax - # (uint64) x7 >>= 32 - shr $32,%r8 - # x9 = j8 - movq 88(%rsp),%r10 - # x8 = x9 - mov %r10,%r11 - # (uint64) x9 >>= 32 - shr $32,%r10 - # x11 = j10 - movq 96(%rsp),%r12 - # x10 = x11 - mov %r12,%r13 - # x10_stack = x10 - movq %r13,168(%rsp) - # (uint64) x11 >>= 32 - shr $32,%r12 - # x13 = j12 - movq 104(%rsp),%r13 - # x12 = x13 - mov %r13,%r14 - # (uint64) x13 >>= 32 - shr $32,%r13 - # x15 = j14 - movq 112(%rsp),%r15 - # x14 = x15 - mov %r15,%rbx - # (uint64) x15 >>= 32 - shr $32,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # i = 20 - mov $20,%r15 -# mainloop: -._mainloop: - # i_backup = i - movq %r15,184(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x12 + x0 - lea (%r14,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x4 ^= a - xor %rbp,%r9 - # b = x1 + x5 - lea (%rdi,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x9 ^= b - xor %rbp,%r10 - # a = x0 + x4 - lea (%rdx,%r9),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x8 ^= a - xor %rbp,%r11 - # b = x5 + x9 - lea (%r15,%r10),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x13 ^= b - xor %rbp,%r13 - # a = x4 + x8 - lea (%r9,%r11),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x12 ^= a - xor %rbp,%r14 - # b = x9 + x13 - lea (%r10,%r13),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x1 ^= b - xor %rbp,%rdi - # a = x8 + x12 - lea (%r11,%r14),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x13 + x1 - lea (%r13,%rdi),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x6 + x10 - lea (%rax,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x14 ^= c - xor %r15,%rbx - # c = x10 + x14 - lea (%rbp,%rbx),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x2 ^= c - xor %r15,%rcx - # c = x14 + x2 - lea (%rbx,%rcx),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x6 ^= c - xor %r15,%rax - # c = x2 + x6 - lea (%rcx,%rax),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x11 + x15 - lea (%r12,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x3 ^= d - xor %rbp,%rsi - # d = x15 + x3 - lea (%r15,%rsi),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x7 ^= d - xor %rbp,%r8 - # d = x3 + x7 - lea (%rsi,%r8),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x11 ^= d - xor %rbp,%r12 - # d = x7 + x11 - lea (%r8,%r12),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x3 + x0 - lea (%rsi,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x1 ^= a - xor %rbp,%rdi - # b = x4 + x5 - lea (%r9,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x6 ^= b - xor %rbp,%rax - # a = x0 + x1 - lea (%rdx,%rdi),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x2 ^= a - xor %rbp,%rcx - # b = x5 + x6 - lea (%r15,%rax),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x7 ^= b - xor %rbp,%r8 - # a = x1 + x2 - lea (%rdi,%rcx),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x3 ^= a - xor %rbp,%rsi - # b = x6 + x7 - lea (%rax,%r8),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x4 ^= b - xor %rbp,%r9 - # a = x2 + x3 - lea (%rcx,%rsi),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x7 + x4 - lea (%r8,%r9),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x9 + x10 - lea (%r10,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x11 ^= c - xor %r15,%r12 - # c = x10 + x11 - lea (%rbp,%r12),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x8 ^= c - xor %r15,%r11 - # c = x11 + x8 - lea (%r12,%r11),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x9 ^= c - xor %r15,%r10 - # c = x8 + x9 - lea (%r11,%r10),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x14 + x15 - lea (%rbx,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x12 ^= d - xor %rbp,%r14 - # d = x15 + x12 - lea (%r15,%r14),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x13 ^= d - xor %rbp,%r13 - # d = x12 + x13 - lea (%r14,%r13),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x14 ^= d - xor %rbp,%rbx - # d = x13 + x14 - lea (%r13,%rbx),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x12 + x0 - lea (%r14,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x4 ^= a - xor %rbp,%r9 - # b = x1 + x5 - lea (%rdi,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x9 ^= b - xor %rbp,%r10 - # a = x0 + x4 - lea (%rdx,%r9),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x8 ^= a - xor %rbp,%r11 - # b = x5 + x9 - lea (%r15,%r10),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x13 ^= b - xor %rbp,%r13 - # a = x4 + x8 - lea (%r9,%r11),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x12 ^= a - xor %rbp,%r14 - # b = x9 + x13 - lea (%r10,%r13),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x1 ^= b - xor %rbp,%rdi - # a = x8 + x12 - lea (%r11,%r14),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x13 + x1 - lea (%r13,%rdi),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x6 + x10 - lea (%rax,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x14 ^= c - xor %r15,%rbx - # c = x10 + x14 - lea (%rbp,%rbx),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x2 ^= c - xor %r15,%rcx - # c = x14 + x2 - lea (%rbx,%rcx),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x6 ^= c - xor %r15,%rax - # c = x2 + x6 - lea (%rcx,%rax),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x11 + x15 - lea (%r12,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x3 ^= d - xor %rbp,%rsi - # d = x15 + x3 - lea (%r15,%rsi),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x7 ^= d - xor %rbp,%r8 - # d = x3 + x7 - lea (%rsi,%r8),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x11 ^= d - xor %rbp,%r12 - # d = x7 + x11 - lea (%r8,%r12),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # x5 = x5_stack - movq 160(%rsp),%r15 - # a = x3 + x0 - lea (%rsi,%rdx),%rbp - # (uint32) a <<<= 7 - rol $7,%ebp - # x1 ^= a - xor %rbp,%rdi - # b = x4 + x5 - lea (%r9,%r15),%rbp - # (uint32) b <<<= 7 - rol $7,%ebp - # x6 ^= b - xor %rbp,%rax - # a = x0 + x1 - lea (%rdx,%rdi),%rbp - # (uint32) a <<<= 9 - rol $9,%ebp - # x2 ^= a - xor %rbp,%rcx - # b = x5 + x6 - lea (%r15,%rax),%rbp - # (uint32) b <<<= 9 - rol $9,%ebp - # x7 ^= b - xor %rbp,%r8 - # a = x1 + x2 - lea (%rdi,%rcx),%rbp - # (uint32) a <<<= 13 - rol $13,%ebp - # x3 ^= a - xor %rbp,%rsi - # b = x6 + x7 - lea (%rax,%r8),%rbp - # (uint32) b <<<= 13 - rol $13,%ebp - # x4 ^= b - xor %rbp,%r9 - # a = x2 + x3 - lea (%rcx,%rsi),%rbp - # (uint32) a <<<= 18 - rol $18,%ebp - # x0 ^= a - xor %rbp,%rdx - # b = x7 + x4 - lea (%r8,%r9),%rbp - # (uint32) b <<<= 18 - rol $18,%ebp - # x5 ^= b - xor %rbp,%r15 - # x10 = x10_stack - movq 168(%rsp),%rbp - # x5_stack = x5 - movq %r15,160(%rsp) - # c = x9 + x10 - lea (%r10,%rbp),%r15 - # (uint32) c <<<= 7 - rol $7,%r15d - # x11 ^= c - xor %r15,%r12 - # c = x10 + x11 - lea (%rbp,%r12),%r15 - # (uint32) c <<<= 9 - rol $9,%r15d - # x8 ^= c - xor %r15,%r11 - # c = x11 + x8 - lea (%r12,%r11),%r15 - # (uint32) c <<<= 13 - rol $13,%r15d - # x9 ^= c - xor %r15,%r10 - # c = x8 + x9 - lea (%r11,%r10),%r15 - # (uint32) c <<<= 18 - rol $18,%r15d - # x10 ^= c - xor %r15,%rbp - # x15 = x15_stack - movq 176(%rsp),%r15 - # x10_stack = x10 - movq %rbp,168(%rsp) - # d = x14 + x15 - lea (%rbx,%r15),%rbp - # (uint32) d <<<= 7 - rol $7,%ebp - # x12 ^= d - xor %rbp,%r14 - # d = x15 + x12 - lea (%r15,%r14),%rbp - # (uint32) d <<<= 9 - rol $9,%ebp - # x13 ^= d - xor %rbp,%r13 - # d = x12 + x13 - lea (%r14,%r13),%rbp - # (uint32) d <<<= 13 - rol $13,%ebp - # x14 ^= d - xor %rbp,%rbx - # d = x13 + x14 - lea (%r13,%rbx),%rbp - # (uint32) d <<<= 18 - rol $18,%ebp - # x15 ^= d - xor %rbp,%r15 - # x15_stack = x15 - movq %r15,176(%rsp) - # i = i_backup - movq 184(%rsp),%r15 - # unsigned>? i -= 4 - sub $4,%r15 - # comment:fp stack unchanged by jump - # goto mainloop if unsigned> - ja ._mainloop - # (uint32) x2 += j2 - addl 64(%rsp),%ecx - # x3 <<= 32 - shl $32,%rsi - # x3 += j2 - addq 64(%rsp),%rsi - # (uint64) x3 >>= 32 - shr $32,%rsi - # x3 <<= 32 - shl $32,%rsi - # x2 += x3 - add %rsi,%rcx - # (uint32) x6 += j6 - addl 80(%rsp),%eax - # x7 <<= 32 - shl $32,%r8 - # x7 += j6 - addq 80(%rsp),%r8 - # (uint64) x7 >>= 32 - shr $32,%r8 - # x7 <<= 32 - shl $32,%r8 - # x6 += x7 - add %r8,%rax - # (uint32) x8 += j8 - addl 88(%rsp),%r11d - # x9 <<= 32 - shl $32,%r10 - # x9 += j8 - addq 88(%rsp),%r10 - # (uint64) x9 >>= 32 - shr $32,%r10 - # x9 <<= 32 - shl $32,%r10 - # x8 += x9 - add %r10,%r11 - # (uint32) x12 += j12 - addl 104(%rsp),%r14d - # x13 <<= 32 - shl $32,%r13 - # x13 += j12 - addq 104(%rsp),%r13 - # (uint64) x13 >>= 32 - shr $32,%r13 - # x13 <<= 32 - shl $32,%r13 - # x12 += x13 - add %r13,%r14 - # (uint32) x0 += j0 - addl 56(%rsp),%edx - # x1 <<= 32 - shl $32,%rdi - # x1 += j0 - addq 56(%rsp),%rdi - # (uint64) x1 >>= 32 - shr $32,%rdi - # x1 <<= 32 - shl $32,%rdi - # x0 += x1 - add %rdi,%rdx - # x5 = x5_stack - movq 160(%rsp),%rdi - # (uint32) x4 += j4 - addl 72(%rsp),%r9d - # x5 <<= 32 - shl $32,%rdi - # x5 += j4 - addq 72(%rsp),%rdi - # (uint64) x5 >>= 32 - shr $32,%rdi - # x5 <<= 32 - shl $32,%rdi - # x4 += x5 - add %rdi,%r9 - # x10 = x10_stack - movq 168(%rsp),%r8 - # (uint32) x10 += j10 - addl 96(%rsp),%r8d - # x11 <<= 32 - shl $32,%r12 - # x11 += j10 - addq 96(%rsp),%r12 - # (uint64) x11 >>= 32 - shr $32,%r12 - # x11 <<= 32 - shl $32,%r12 - # x10 += x11 - add %r12,%r8 - # x15 = x15_stack - movq 176(%rsp),%rdi - # (uint32) x14 += j14 - addl 112(%rsp),%ebx - # x15 <<= 32 - shl $32,%rdi - # x15 += j14 - addq 112(%rsp),%rdi - # (uint64) x15 >>= 32 - shr $32,%rdi - # x15 <<= 32 - shl $32,%rdi - # x14 += x15 - add %rdi,%rbx - # out = out_backup - movq 136(%rsp),%rdi - # m = m_backup - movq 144(%rsp),%rsi - # x0 ^= *(uint64 *) (m + 0) - xorq 0(%rsi),%rdx - # *(uint64 *) (out + 0) = x0 - movq %rdx,0(%rdi) - # x2 ^= *(uint64 *) (m + 8) - xorq 8(%rsi),%rcx - # *(uint64 *) (out + 8) = x2 - movq %rcx,8(%rdi) - # x4 ^= *(uint64 *) (m + 16) - xorq 16(%rsi),%r9 - # *(uint64 *) (out + 16) = x4 - movq %r9,16(%rdi) - # x6 ^= *(uint64 *) (m + 24) - xorq 24(%rsi),%rax - # *(uint64 *) (out + 24) = x6 - movq %rax,24(%rdi) - # x8 ^= *(uint64 *) (m + 32) - xorq 32(%rsi),%r11 - # *(uint64 *) (out + 32) = x8 - movq %r11,32(%rdi) - # x10 ^= *(uint64 *) (m + 40) - xorq 40(%rsi),%r8 - # *(uint64 *) (out + 40) = x10 - movq %r8,40(%rdi) - # x12 ^= *(uint64 *) (m + 48) - xorq 48(%rsi),%r14 - # *(uint64 *) (out + 48) = x12 - movq %r14,48(%rdi) - # x14 ^= *(uint64 *) (m + 56) - xorq 56(%rsi),%rbx - # *(uint64 *) (out + 56) = x14 - movq %rbx,56(%rdi) - # bytes = bytes_backup - movq 152(%rsp),%rdx - # in8 = j8 - movq 88(%rsp),%rcx - # in8 += 1 - add $1,%rcx - # j8 = in8 - movq %rcx,88(%rsp) - # unsigned>? unsigned<? bytes - 64 - cmp $64,%rdx - # comment:fp stack unchanged by jump - # goto bytesatleast65 if unsigned> - ja ._bytesatleast65 - # comment:fp stack unchanged by jump - # goto bytesatleast64 if !unsigned< - jae ._bytesatleast64 - # m = out - mov %rdi,%rsi - # out = ctarget - movq 128(%rsp),%rdi - # i = bytes - mov %rdx,%rcx - # while (i) { *out++ = *m++; --i } - rep movsb - # comment:fp stack unchanged by fallthrough -# bytesatleast64: -._bytesatleast64: - # x = x_backup - movq 120(%rsp),%rdi - # in8 = j8 - movq 88(%rsp),%rsi - # *(uint64 *) (x + 32) = in8 - movq %rsi,32(%rdi) - # r11 = r11_stack - movq 0(%rsp),%r11 - # r12 = r12_stack - movq 8(%rsp),%r12 - # r13 = r13_stack - movq 16(%rsp),%r13 - # r14 = r14_stack - movq 24(%rsp),%r14 - # r15 = r15_stack - movq 32(%rsp),%r15 - # rbx = rbx_stack - movq 40(%rsp),%rbx - # rbp = rbp_stack - movq 48(%rsp),%rbp - # comment:fp stack unchanged by fallthrough -# done: -._done: - # leave - add %r11,%rsp - mov %rdi,%rax - mov %rsi,%rdx - ret -# bytesatleast65: -._bytesatleast65: - # bytes -= 64 - sub $64,%rdx - # out += 64 - add $64,%rdi - # m += 64 - add $64,%rsi - # comment:fp stack unchanged by jump - # goto bytesatleast1 - jmp ._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c deleted file mode 100644 index b07d7d959806..000000000000 --- a/arch/x86/crypto/salsa20_glue.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Glue code for optimized assembly version of Salsa20. - * - * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com> - * - * The assembly codes are public domain assembly codes written by Daniel. J. - * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation - * and to remove extraneous comments and functions that are not needed. - * - i586 version, renamed as salsa20-i586-asm_32.S - * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s> - * - x86-64 version, renamed as salsa20-x86_64-asm_64.S - * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s> - * - * Also modified to set up the initial state using the generic C code rather - * than in assembly. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - */ - -#include <asm/unaligned.h> -#include <crypto/internal/skcipher.h> -#include <crypto/salsa20.h> -#include <linux/module.h> - -asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst, - u32 bytes); - -static int salsa20_asm_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, true); - - crypto_salsa20_init(state, ctx, walk.iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - salsa20_encrypt_bytes(state, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static struct skcipher_alg alg = { - .base.cra_name = "salsa20", - .base.cra_driver_name = "salsa20-asm", - .base.cra_priority = 200, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct salsa20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = SALSA20_MIN_KEY_SIZE, - .max_keysize = SALSA20_MAX_KEY_SIZE, - .ivsize = SALSA20_IV_SIZE, - .chunksize = SALSA20_BLOCK_SIZE, - .setkey = crypto_salsa20_setkey, - .encrypt = salsa20_asm_crypt, - .decrypt = salsa20_asm_crypt, -}; - -static int __init init(void) -{ - return crypto_register_skcipher(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_skcipher(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); -MODULE_ALIAS_CRYPTO("salsa20"); -MODULE_ALIAS_CRYPTO("salsa20-asm"); |