author    | Andy Polyakov <appro@openssl.org> | 2017-12-01 13:13:25 +0100
committer | Andy Polyakov <appro@openssl.org> | 2017-12-04 17:21:15 +0100
commit    | 9ee020f8dc7813db82a119058d8f57e70e7e8904 (patch)
tree      | 5aa26351ec7e766694e912a3f9fa61b537d1b552 /crypto/modes/asm
parent    | modes/asm/ghashv8-armx.pl: implement 4x aggregate factor. (diff)
modes/asm/ghashv8-armx.pl: modulo-schedule loop.
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4830)
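
The change modulo-schedules (software-pipelines) the 4x loop: the loads, byte reversal and carry-less multiplications for the next 64-byte group are now issued inside the current iteration, so they can overlap the Karatsuba post-processing and reduction of the previous group, while a prologue ahead of .Loop4x and a new .Ltail4x epilogue handle the first and last groups. Below is a minimal C sketch of that loop restructuring, for illustration only (not OpenSSL code); mul_phase and fold_phase are hypothetical stand-ins for the independent vpmull work and the serial reduction, respectively.

```c
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the two halves of the per-block work: mul_phase depends only
 * on the input block, fold_phase also depends on the running accumulator. */
static uint64_t mul_phase(uint64_t block)
{
    return block * 0x9e3779b97f4a7c15ULL;
}

static uint64_t fold_phase(uint64_t acc, uint64_t prod)
{
    return (acc ^ prod) * 0xc2b2ae3d27d4eb4fULL + 1;
}

/* Straightforward loop: each iteration multiplies and folds the same block,
 * so the (independent) multiply sits behind the (serial) fold. */
static uint64_t digest_simple(const uint64_t *in, size_t n)
{
    uint64_t acc = 0;

    for (size_t i = 0; i < n; i++)
        acc = fold_phase(acc, mul_phase(in[i]));
    return acc;
}

/* Modulo-scheduled loop: a prologue primes the first product, the steady-state
 * body computes the product for block i+1 while folding block i (the analogue
 * of hoisting the next group's vpmull sequence above the reduction), and an
 * epilogue folds the last product outside the loop (the .Ltail4x analogue). */
static uint64_t digest_pipelined(const uint64_t *in, size_t n)
{
    uint64_t acc = 0, prod;

    if (n == 0)
        return acc;

    prod = mul_phase(in[0]);                /* prologue */
    for (size_t i = 1; i < n; i++) {
        uint64_t next = mul_phase(in[i]);   /* independent work, overlaps... */
        acc = fold_phase(acc, prod);        /* ...the serial fold of block i-1 */
        prod = next;
    }
    return fold_phase(acc, prod);           /* epilogue / tail */
}

int main(void)
{
    uint64_t in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

    /* Both variants compute the same digest; only the schedule differs. */
    printf("simple:    %016" PRIx64 "\n", digest_simple(in, 8));
    printf("pipelined: %016" PRIx64 "\n", digest_pipelined(in, 8));
    return 0;
}
```

Both functions return the same value for any input; the scheduled form simply keeps independent multiply work in flight while the serial accumulator update completes, which is the kind of overlap the reordered vpmull/veor sequence in the diff below is after on a dual-issue or out-of-order core.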
Diffstat (limited to 'crypto/modes/asm')
-rw-r--r-- | crypto/modes/asm/ghashv8-armx.pl | 73
1 file changed, 70 insertions, 3 deletions
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index ef7c74798d..2498a1d646 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -423,10 +423,7 @@ gcm_ghash_v8_4x:
 #ifndef __ARMEB__
 	vrev64.8	$Xl,$Xl
 #endif
-	b	.Loop4x

-.align	4
-.Loop4x:
 	vld1.64		{$I0-$j3},[$inp],#64
 #ifndef __ARMEB__
 	vrev64.8	$j1,$j1
@@ -461,6 +458,13 @@ gcm_ghash_v8_4x:
 	veor	$Yh,$Yh,$I1
 	veor	$Ym,$Ym,$j1

+	subs	$len,$len,#64
+	b.eq	.Ltail4x
+
+	b	.Loop4x
+
+.align	4
+.Loop4x:
 	veor	$t0,$I0,$Xl
 	vext.8	$IN,$t0,$t0,#8

@@ -489,9 +493,72 @@ gcm_ghash_v8_4x:
 	veor	$Xl,$Xl,$t2
 	vext.8	$Xl,$Xl,$Xl,#8

+	vld1.64		{$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+	vrev64.8	$j1,$j1
+	vrev64.8	$j2,$j2
+	vrev64.8	$j3,$j3
+	vrev64.8	$I0,$I0
+#endif
+	vext.8		$I3,$j3,$j3,#8
+	vext.8		$I2,$j2,$j2,#8
+	vext.8		$I1,$j1,$j1,#8
+
+	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
+	veor		$j3,$j3,$I3
+	vpmull2.p64	$Yh,$H,$I3
+	vpmull.p64	$Ym,$Hhl,$j3
+
+	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
+	veor		$j2,$j2,$I2
+	vpmull2.p64	$I2,$H2,$I2
+	vpmull2.p64	$j2,$Hhl,$j2
+
+	veor		$Yl,$Yl,$t0
+	veor		$Yh,$Yh,$I2
+	veor		$Ym,$Ym,$j2
+
+	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
+	veor		$j1,$j1,$I1
+	vpmull2.p64	$I1,$H3,$I1
+	vpmull.p64	$j1,$H34,$j1
+
+	veor		$Yl,$Yl,$j3
+	veor		$Yh,$Yh,$I1
+	veor		$Ym,$Ym,$j1
+
 	subs	$len,$len,#64
 	b.ne	.Loop4x

+.Ltail4x:
+	veor		$t0,$I0,$Xl
+	vext.8		$IN,$t0,$t0,#8
+
+	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
+	veor		$t0,$t0,$IN
+	vpmull2.p64	$Xh,$H4,$IN
+	vpmull2.p64	$Xm,$H34,$t0
+
+	veor		$Xl,$Xl,$Yl
+	veor		$Xh,$Xh,$Yh
+	veor		$Xm,$Xm,$Ym
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	vext.8		$Xl,$Xl,$Xl,#8
+
 #ifndef __ARMEB__
 	vrev64.8	$Xl,$Xl
 #endif
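
For reference, the identities behind the per-group arithmetic visible above are standard GHASH background rather than anything introduced by this commit; the X_i/I_i below mirror the Xi/Ii of the assembly comments, and every product is taken in GF(2^128) reduced modulo x^128 + x^7 + x^2 + x + 1.

```latex
% 4x aggregation: Horner's step X_{i+1} = (X_i \oplus I_i)\cdot H unrolled four
% times, which is why each 64-byte group contributes H\cdot I_{i+3},
% H^2\cdot I_{i+2}, H^3\cdot I_{i+1} and H^4\cdot(X_i \oplus I_i):
X_{i+4} \;=\; (X_i \oplus I_i)\,H^4 \;\oplus\; I_{i+1}H^3 \;\oplus\; I_{i+2}H^2 \;\oplus\; I_{i+3}H

% Karatsuba split of each 128x128-bit carry-less product into the three
% 64x64-bit vpmull/vpmull2 multiplications issued per term:
A \cdot B \;=\; A_h B_h\,x^{128}
   \;\oplus\; \bigl[(A_h \oplus A_l)(B_h \oplus B_l) \oplus A_h B_h \oplus A_l B_l\bigr]\,x^{64}
   \;\oplus\; A_l B_l
```

The middle Karatsuba term is where the $Hhl and $H34 operands come in: they appear to hold precomputed XORs of the high and low halves of the corresponding powers of H, so each aggregated term needs only three 64x64-bit multiplies (low, high, and one on the XORed halves).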