summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
author: Andy Polyakov <appro@openssl.org>, 2019-04-17 21:30:39 +0200
committer: Andy Polyakov <appro@openssl.org>, 2019-04-17 21:30:39 +0200
commit: d6e4287c9726691e800bff221be71edd894a3c6a (patch)
tree: a4986119090999cd4b0818fa0c3ddf21d6a9e2eb /crypto/aes
parent: ARM64 assembly pack: add ThunderX2 results. (diff)
download: openssl-d6e4287c9726691e800bff221be71edd894a3c6a.tar.xz
openssl-d6e4287c9726691e800bff221be71edd894a3c6a.zip
aes/asm/aesv8-armx.pl: ~20% improvement on ThunderX2.
Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/8776)
Diffstat (limited to 'crypto/aes')
-rwxr-xr-x crypto/aes/asm/aesv8-armx.pl 395
1 files changed, 389 insertions, 6 deletions
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index b708a61d50..3b3a53bf30 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -27,19 +27,34 @@
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
+# April 2019
+#
+# The key to performance of parallelizable modes is round instruction
+# interleaving. But which factor to use? There is an optimal one for
+# each combination of instruction latency and issue rate, beyond
+# which increasing the interleave factor doesn't pay off. On the down
+# side, too high a factor costs code size and wastes resources on
+# platforms that cannot benefit from it. In other words, you want it
+# to be just right. So far an interleave factor of 3x has served all
+# platforms well. But for ThunderX2 the optimal interleave factor was
+# measured to be 5x...
+#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
-# Cortex-A53 1.32 1.29 1.46
-# Cortex-A57(*) 1.95 0.85 0.93
-# Denver 1.96 0.86 0.80
-# Mongoose 1.33 1.20 1.20
-# Kryo 1.26 0.94 1.00
-# ThunderX2 5.95 1.53 1.55
+# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
+# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
+# Cortex-A72 1.33 0.85/0.88 0.92/0.96
+# Denver 1.96 0.65/0.86 0.76/0.80
+# Mongoose 1.33 1.23/1.20 1.30/1.20
+# Kryo 1.26 0.87/0.94 1.00/1.00
+# ThunderX2 5.95 1.25 1.30
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still same even for updated module;
+# (**) numbers after slash are for 32-bit code, which is 3x-
+# interleaved;
$flavour = shift;
$output = shift;
@@ -524,6 +539,13 @@ $code.=<<___;
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode, left undefined for other flavours
+my ($dat4,$in4,$tmp4); # ditto; names for the 4th and 5th blocks of the 5x-interleaved path
+if ($flavour =~ /64/) { # names become q16..q23 in list order; $tmp2 is not listed and keeps its earlier value
+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
$code.=<<___;
.align 5
.Lcbc_dec:
@@ -540,7 +562,196 @@ $code.=<<___;
vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
vorr $in2,$dat2,$dat2
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#32
+ b.lo .Loop3x_cbc_dec
+
+ vld1.8 {$dat3},[$inp],#16
+ vld1.8 {$dat4},[$inp],#16
+ sub $len,$len,#32 // bias
+ mov $cnt,$rounds
+ vorr $in3,$dat3,$dat3
+ vorr $in4,$dat4,$dat4
+
+.Loop5x_cbc_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_cbc_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ cmp $len,#0x40 // because .Lcbc_tail4x
+ sub $len,$len,#0x50
+
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
+ mov $key_,$key
+
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat1,q10
+ aesimc $dat1,$dat1
+ aesd $dat2,q10
+ aesimc $dat2,$dat2
+ aesd $dat3,q10
+ aesimc $dat3,$dat3
+ aesd $dat4,q10
+ aesimc $dat4,$dat4
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat4
+ // are loaded with last "words"
+ add x6,$len,#0x60 // because .Lcbc_tail4x
+
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ aesd $dat1,q11
+ aesimc $dat1,$dat1
+ aesd $dat2,q11
+ aesimc $dat2,$dat2
+ aesd $dat3,q11
+ aesimc $dat3,$dat3
+ aesd $dat4,q11
+ aesimc $dat4,$dat4
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ aesd $dat3,q12
+ aesimc $dat3,$dat3
+ aesd $dat4,q12
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat3,q13
+ aesimc $dat3,$dat3
+ aesd $dat4,q13
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ aesd $dat3,q14
+ aesimc $dat3,$dat3
+ aesd $dat4,q14
+ aesimc $dat4,$dat4
+
+ veor $tmp0,$ivec,$rndlast
+ aesd $dat0,q15
+ veor $tmp1,$in0,$rndlast
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat1,q15
+ veor $tmp2,$in1,$rndlast
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat2,q15
+ veor $tmp3,$in2,$rndlast
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat3,q15
+ veor $tmp4,$in3,$rndlast
+ vld1.8 {$in3},[$inp],#16
+ aesd $dat4,q15
+ vorr $ivec,$in4,$in4
+ vld1.8 {$in4},[$inp],#16
+ cbz x6,.Lcbc_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$tmp0,$dat0
+ vorr $dat0,$in0,$in0
+ veor $tmp1,$tmp1,$dat1
+ vorr $dat1,$in1,$in1
+ veor $tmp2,$tmp2,$dat2
+ vorr $dat2,$in2,$in2
+ veor $tmp3,$tmp3,$dat3
+ vorr $dat3,$in3,$in3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat4,$in4,$in4
+ vst1.8 {$tmp1},[$out],#16
+ mov $cnt,$rounds
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_cbc_dec
+
+ add $len,$len,#0x50
+ cbz $len,.Lcbc_done
+
+ add $cnt,$rounds,#2
+ subs $len,$len,#0x30
+ vorr $dat0,$in2,$in2
+ vorr $in0,$in2,$in2
+ vorr $dat1,$in3,$in3
+ vorr $in1,$in3,$in3
+ vorr $dat2,$in4,$in4
+ vorr $in2,$in4,$in4
+ b.lo .Lcbc_dec_tail
+
+ b .Loop3x_cbc_dec
+
+.align 4
+.Lcbc_tail4x:
+ veor $tmp1,$tmp0,$dat1
+ veor $tmp2,$tmp2,$dat2
+ veor $tmp3,$tmp3,$dat3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+
+ b .Lcbc_done
+.align 4
+___
+$code.=<<___;
.Loop3x_cbc_dec:
aesd $dat0,q8
aesimc $dat0,$dat0
@@ -701,6 +912,9 @@ my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+# used only in 64-bit mode; the four names take q16..q19, surplus list items are discarded
+my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
+
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
@@ -753,6 +967,175 @@ $code.=<<___;
rev $tctr2, $ctr
sub $len,$len,#3 // bias
vmov.32 ${dat2}[3],$tctr2
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#2
+ b.lo .Loop3x_ctr32
+
+ add w13,$ctr,#1
+ add w14,$ctr,#2
+ vorr $dat3,$dat0,$dat0
+ rev w13,w13
+ vorr $dat4,$dat0,$dat0
+ rev w14,w14
+ vmov.32 ${dat3}[3],w13
+ sub $len,$len,#2 // bias
+ vmov.32 ${dat4}[3],w14
+ add $ctr,$ctr,#2
+ b .Loop5x_ctr32
+
+.align 4
+.Loop5x_ctr32:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_ctr32
+
+ mov $key_,$key
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ add $tctr0,$ctr,#1
+ add $tctr1,$ctr,#2
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ add $tctr2,$ctr,#3
+ add w13,$ctr,#4
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ add w14,$ctr,#5
+ rev $tctr0,$tctr0
+ aese $dat3,q12
+ aesmc $dat3,$dat3
+ rev $tctr1,$tctr1
+ rev $tctr2,$tctr2
+ aese $dat4,q12
+ aesmc $dat4,$dat4
+ rev w13,w13
+ rev w14,w14
+
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat3,q13
+ aesmc $dat3,$dat3
+ aese $dat4,q13
+ aesmc $dat4,$dat4
+
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aese $dat3,q14
+ aesmc $dat3,$dat3
+ vld1.8 {$in3},[$inp],#16
+ aese $dat4,q14
+ aesmc $dat4,$dat4
+ vld1.8 {$in4},[$inp],#16
+
+ aese $dat0,q15
+ veor $in0,$in0,$rndlast
+ aese $dat1,q15
+ veor $in1,$in1,$rndlast
+ aese $dat2,q15
+ veor $in2,$in2,$rndlast
+ aese $dat3,q15
+ veor $in3,$in3,$rndlast
+ aese $dat4,q15
+ veor $in4,$in4,$rndlast
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ veor $in2,$in2,$dat2
+ vorr $dat2,$ivec,$ivec
+ veor $in3,$in3,$dat3
+ vorr $dat3,$ivec,$ivec
+ veor $in4,$in4,$dat4
+ vorr $dat4,$ivec,$ivec
+
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3],$tctr0
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3],$tctr1
+ vst1.8 {$in2},[$out],#16
+ vmov.32 ${dat2}[3],$tctr2
+ vst1.8 {$in3},[$out],#16
+ vmov.32 ${dat3}[3],w13
+ vst1.8 {$in4},[$out],#16
+ vmov.32 ${dat4}[3],w14
+
+ mov $cnt,$rounds
+ cbz $len,.Lctr32_done
+
+ add $ctr,$ctr,#5
+ subs $len,$len,#5
+ b.hs .Loop5x_ctr32
+
+ add $len,$len,#5
+ sub $ctr,$ctr,#5
+
+ cmp $len,#2
+ mov $step,#16
+ cclr $step,lo
+ b.ls .Lctr32_tail
+
+ sub $len,$len,#3 // bias
+ add $ctr,$ctr,#3
+___
+$code.=<<___;
b .Loop3x_ctr32
.align 4