author | Pauli <pauli@openssl.org> | 2021-05-20 05:51:59 +0200
---|---|---
committer | Matt Caswell <matt@openssl.org> | 2021-05-20 09:51:30 +0200
commit | e3884ec5c37334e585e9208ce69d7e5b3cad4624 (patch) |
tree | 08ade3022fda3a64cd84b629736c0c9ac051833b /crypto |
parent | Add migration guide for 3.0 (diff) |
download | openssl-e3884ec5c37334e585e9208ce69d7e5b3cad4624.tar.xz openssl-e3884ec5c37334e585e9208ce69d7e5b3cad4624.zip |
Revert "ARM assembly pack: translate bit-sliced AES implementation to AArch64"
This reverts commit da51566b256e0c0536d5b986e676863b0526bf5e.
Fixes #15321
Reviewed-by: Tim Hudson <tjh@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15364)
Diffstat (limited to 'crypto')
-rw-r--r-- | crypto/aes/asm/bsaes-armv8.S | 2338 |
-rw-r--r-- | crypto/aes/build.info | 5 |
2 files changed, 2 insertions, 2341 deletions
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
deleted file mode 100644
index 9bd02d0c8a..0000000000
--- a/crypto/aes/asm/bsaes-armv8.S
+++ /dev/null
@@ -1,2338 +0,0 @@
-// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// ====================================================================
-// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
-// project. Rights for redistribution and usage in source and binary
-// forms are granted according to the OpenSSL license.
-// ====================================================================
-//
-// This implementation is a translation of bsaes-armv7 for AArch64.
-// No attempt has been made to carry across the build switches for
-// kernel targets, since the Linux kernel crypto support has moved on
-// from when it was based on OpenSSL.
-
-// A lot of hand-scheduling has been performed. Consequently, this code
-// doesn't factor out neatly into macros in the same way that the
-// AArch32 version did, and there is little to be gained by wrapping it
-// up in Perl, and it is presented as pure assembly.
-
-#include "crypto/arm_arch.h"
-
-.text
-
-.type _bsaes_decrypt8,%function
-.align 4
-// On entry:
-//   x9 -> key (previously expanded using _bsaes_key_convert)
-//   x10 = number of rounds
-//   v0-v7 input data
-// On exit:
-//   x9-x11 corrupted
-//   other general-purpose registers preserved
-//   v0-v7 output data
-//   v11-v15 preserved
-//   other SIMD registers corrupted
-_bsaes_decrypt8:
[... bit-sliced decryption body elided ...]
-.size _bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type _bsaes_const,%object
-.align 6
-_bsaes_const:
-// InvShiftRows constants
-// Used in _bsaes_decrypt8, which assumes contiguity
-// .LM0ISR used with round 0 key
-// .LISR used with middle round keys
-// .LISRM0 used with final round key
-.LM0ISR:
-.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-.quad 0x01040b0e0205080f, 0x0306090c00070a0d
-
-// ShiftRows constants
-// Used in _bsaes_encrypt8, which assumes contiguity
-// .LM0SR used with round 0 key
-// .LSR used with middle round keys
-// .LSRM0 used with final round key
-.LM0SR:
-.quad 0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-.quad 0x0304090e00050a0f, 0x01060b0c0207080d
-
-.LM0_bigendian:
-.quad 0x02060a0e03070b0f, 0x0004080c0105090d
-.LM0_littleendian:
-.quad 0x0105090d0004080c, 0x03070b0f02060a0e
-
-// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
-// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
-.LREVM0SR:
-.quad 0x090d01050c000408, 0x03070b0f060a0e02
-
-.align 6
-.size _bsaes_const,.-_bsaes_const
-
-.type _bsaes_encrypt8,%function
-.align 4
-// On entry:
-//   x9 -> key (previously expanded using _bsaes_key_convert)
-//   x10 = number of rounds
-//   v0-v7 input data
-// On exit:
-//   x9-x11 corrupted
-//   other general-purpose registers preserved
-//   v0-v7 output data
-//   v11-v15 preserved
-//   other SIMD registers corrupted
-_bsaes_encrypt8:
[... bit-sliced encryption body, including the _bsaes_encrypt8_alt entry point, elided ...]
-.size _bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type _bsaes_key_convert,%function
-.align 4
-// On entry:
-//   x9 -> input key (big-endian)
-//   x10 = number of rounds
-//   x17 -> output key (native endianness)
-// On exit:
-//   x9, x10 corrupted
-//   x11 -> .LM0_bigendian
-//   x17 -> last quadword of output key
-//   other general-purpose registers preserved
-//   v2-v6 preserved
-//   v7.16b[] = 0x63
-//   v8-v14 preserved
-//   v15 = last round key (converted to native endianness)
-//   other SIMD registers corrupted
-_bsaes_key_convert:
[... key-schedule conversion body elided ...]
-.size _bsaes_key_convert,.-_bsaes_key_convert
-
-.globl bsaes_cbc_encrypt
-.type bsaes_cbc_encrypt,%function
-.align 4
-// On entry:
-//   x0 -> input ciphertext
-//   x1 -> output plaintext
-//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
-//   x3 -> key
-//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
-//   w5 must be == 0
-// On exit:
-//   Output plaintext filled in
-//   Initialisation vector overwritten with last quadword of ciphertext
-//   No output registers, usual AAPCS64 register preservation
-bsaes_cbc_encrypt:
[... CBC decryption body elided ...]
-.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
-.globl bsaes_ctr32_encrypt_blocks
-.type bsaes_ctr32_encrypt_blocks,%function
-.align 4
-// On entry:
-//   x0 -> input text (whole 16-byte blocks)
-//   x1 -> output text (whole 16-byte blocks)
-//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
-//   x3 -> key
-//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
-// On exit:
-//   Output text filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_ctr32_encrypt_blocks:
[... CTR-mode body elided ...]
-.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-
-.globl bsaes_xts_encrypt
-.type bsaes_xts_encrypt,%function
-.align 4
-// On entry:
-//   x0 -> input plaintext
-//   x1 -> output ciphertext
-//   x2 -> length of text in bytes (must be at least 16)
-//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
-//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-//   x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-//   Output ciphertext filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_xts_encrypt:
[... XTS encryption body, including ciphertext stealing, elided ...]
-.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-// The assembler doesn't seem capable of de-duplicating these when expressed
-// using `ldr qd,=` syntax, so assign a symbolic address
-.align 5
-.Lxts_magic:
-.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
-
-.globl bsaes_xts_decrypt
-.type bsaes_xts_decrypt,%function
-.align 4
-// On entry:
-//   x0 -> input ciphertext
-//   x1 -> output plaintext
-//   x2 -> length of text in bytes (must be at least 16)
-//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
-//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-//   x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-//   Output plaintext filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_xts_decrypt:
[... XTS decryption body, including ciphertext stealing, elided ...]
-.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index edf6c8106e..0b9f499ee6 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -30,8 +30,8 @@ IF[{- !$disabled{asm} -}]
   $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S
   $AESDEF_armv4=AES_ASM BSAES_ASM
-  $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S
-  $AESDEF_aarch64=BSAES_ASM VPAES_ASM
+  $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S
+  $AESDEF_aarch64=VPAES_ASM
   $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
   $AESDEF_parisc11=AES_ASM
@@ -80,7 +80,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
 ENDIF
 GENERATE[aes-ia64.s]=asm/aes-ia64.S
-GENERATE[bsaes-armv8.S]=asm/bsaes-armv8.S
 GENERATE[aes-586.s]=asm/aes-586.pl
 DEPEND[aes-586.s]=../perlasm/x86asm.pl
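For context, the deleted module exported four public entry points. The sketch below gives a C-level view of their prototypes, inferred from the `On entry`/`On exit` register comments in the deleted file and from the conventions of the ARMv7 bsaes code; the exact declarations live in OpenSSL's internal C sources, so treat parameter names and types here as illustrative rather than authoritative.

```c
/*
 * Hypothetical prototypes for the deleted AArch64 bsaes entry points,
 * reconstructed from the register-usage comments in bsaes-armv8.S.
 * Assumption: the key arguments are expanded AES_KEY schedules, as with
 * the ARMv7 bsaes implementation.
 */
#include <stddef.h>
#include <openssl/aes.h>

/* CBC decryption only: enc must be 0; ivec is overwritten with the last
 * ciphertext block so a subsequent call can continue the stream. */
void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char ivec[16], int enc);

/* CTR mode over whole 16-byte blocks; the 32-bit big-endian counter in
 * the last word of ivec increments modulo 2^32 for each block. */
void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t blocks, const AES_KEY *key,
                                const unsigned char ivec[16]);

/* XTS: key1 processes the data blocks, key2 encrypts the initial vector
 * (typically the sector number) to yield the initial tweak; len >= 16. */
void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1, const AES_KEY *key2,
                       const unsigned char iv[16]);
void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
                       size_t len, const AES_KEY *key1, const AES_KEY *key2,
                       const unsigned char iv[16]);
```

With the revert, the aarch64 build drops `BSAES_ASM` from `$AESDEF_aarch64`, so these entry points are no longer compiled in and AES on AArch64 falls back to the remaining aesv8/vpaes/C paths.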