summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2018-06-02 14:03:27 +0200
committer Andy Polyakov <appro@openssl.org> 2018-06-03 21:20:06 +0200
commit 41013cd63c068e2f271fabc92702ee67d800f0cb (patch)
tree 51fae99bc0bb6a7cb78500bc5109f2f86cd03d83
parent {arm64|x86_64}cpuid.pl: add special 16-byte case to OPENSSL_memcmp. (diff)
download openssl-41013cd63c068e2f271fabc92702ee67d800f0cb.tar.xz
download openssl-41013cd63c068e2f271fabc92702ee67d800f0cb.zip
PPC assembly pack: correct POWER9 results.
As it turns out, the originally published results were skewed by "turbo" mode. The VM apparently remains oblivious to dynamic frequency scaling and reports that the processor operates at "base" frequency at all times, while the actual frequency gets increased under load.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)
-rwxr-xr-x crypto/aes/asm/aesp8-ppc.pl 3
-rwxr-xr-x crypto/chacha/asm/chacha-ppc.pl 2
-rwxr-xr-x crypto/modes/asm/ghashp8-ppc.pl 2
-rwxr-xr-x crypto/poly1305/asm/poly1305-ppc.pl 2
-rwxr-xr-x crypto/poly1305/asm/poly1305-ppcfp.pl 1
-rw-r--r-- crypto/poly1305/poly1305_ieee754.c 1
-rwxr-xr-x crypto/sha/asm/keccak1600-ppc64.pl 2
-rwxr-xr-x crypto/sha/asm/keccak1600p8-ppc.pl 2
-rwxr-xr-x crypto/sha/asm/sha512p8-ppc.pl 4
9 files changed, 9 insertions, 10 deletions
diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
index 86709404cf..488b133250 100755
--- a/crypto/aes/asm/aesp8-ppc.pl
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -40,7 +40,8 @@
# CBC en-/decrypt CTR XTS
# POWER8[le] 3.96/0.72 0.74 1.1
# POWER8[be] 3.75/0.65 0.66 1.0
-# POWER9[le] 3.05/0.65 0.65 0.80
+# POWER9[le] 4.02/0.86 0.84 1.05
+# POWER9[be] 3.99/0.78 0.79 0.97
$flavour = shift;
diff --git a/crypto/chacha/asm/chacha-ppc.pl b/crypto/chacha/asm/chacha-ppc.pl
index 350d5fae37..6dd05819ad 100755
--- a/crypto/chacha/asm/chacha-ppc.pl
+++ b/crypto/chacha/asm/chacha-ppc.pl
@@ -27,7 +27,7 @@
# PPC970/G5 9.29/+160% ?
# POWER7 8.62/+61% 3.38
# POWER8 8.70/+51% 3.36
-# POWER9 6.61/+29% 3.30(*)
+# POWER9 8.80/+29% 4.50(*)
#
# (*) this is trade-off result, it's possible to improve it, but
# then it would negatively affect all others;
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
index 6df485efcc..6a2ac71295 100755
--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -30,7 +30,7 @@
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
-# POWER9 delivers 0.40 cpb.
+# POWER9 delivers 0.51 cpb.
$flavour=shift;
$output =shift;
diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl
index cb4ae23869..0c6d015d58 100755
--- a/crypto/poly1305/asm/poly1305-ppc.pl
+++ b/crypto/poly1305/asm/poly1305-ppc.pl
@@ -28,7 +28,7 @@
# PPC970 7.00/+114% 3.51/+205%
# POWER7 3.75/+260% 1.93/+100%
# POWER8 - 2.03/+200%
-# POWER9 - 1.56/+150%
+# POWER9 - 2.00/+150%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl
index 2abb8e20b4..09f8185848 100755
--- a/crypto/poly1305/asm/poly1305-ppcfp.pl
+++ b/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -26,7 +26,6 @@
# PPC970 6.03/+80%
# POWER7 3.50/+30%
# POWER8 3.75/+10%
-# POWER9 2.80/+12%
$flavour = shift;
diff --git a/crypto/poly1305/poly1305_ieee754.c b/crypto/poly1305/poly1305_ieee754.c
index 1a06e03558..995a02e5c1 100644
--- a/crypto/poly1305/poly1305_ieee754.c
+++ b/crypto/poly1305/poly1305_ieee754.c
@@ -38,7 +38,6 @@
* POWER6 4.92
* POWER7 4.50
* POWER8 4.10
- * POWER9 3.14
*
* z10 11.2
* z196+ 7.30
diff --git a/crypto/sha/asm/keccak1600-ppc64.pl b/crypto/sha/asm/keccak1600-ppc64.pl
index bc1023e399..30e70c5d6d 100755
--- a/crypto/sha/asm/keccak1600-ppc64.pl
+++ b/crypto/sha/asm/keccak1600-ppc64.pl
@@ -30,7 +30,7 @@
# PPC970/G5 14.6/+120%
# POWER7 10.3/+100%
# POWER8 11.5/+85%
-# POWER9 7.2/+45%
+# POWER9 9.4/+45%
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
# over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
diff --git a/crypto/sha/asm/keccak1600p8-ppc.pl b/crypto/sha/asm/keccak1600p8-ppc.pl
index a0aeeb0412..de2bcd660a 100755
--- a/crypto/sha/asm/keccak1600p8-ppc.pl
+++ b/crypto/sha/asm/keccak1600p8-ppc.pl
@@ -23,7 +23,7 @@
# buffer for r=1088, which matches SHA3-256. This is 17% better than
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve higher scalar instruction issue rate, then
-# this module will loose... And it does on POWER9 with 8.8 vs. 7.2.
+# this module will loose... And it does on POWER9 with 12.0 vs. 9.4.
$flavour = shift;
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index a33ae4dc45..7a8d4358f0 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -37,8 +37,8 @@
# build of sha512-ppc.pl, presented for reference.
#
# POWER8 POWER9
-# SHA256 9.9 [15.8] 9.2 [9.3]
-# SHA512 6.3 [10.3] 5.8 [5.9]
+# SHA256 9.9 [15.8] 12.2 [12.5]
+# SHA512 6.3 [10.3] 7.7 [7.9]
$flavour=shift;
$output =shift;