[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Sat Jul 15 21:05:11 UTC 2017
The branch master has been updated
via d212b98b36d0874eb39c5ef08a18a43b9b006142 (commit)
via 91dbdc63bda4d0d4c5431d14cce70c765cc2bdaf (commit)
from 74df8c4ce3c7ccb4e2809a44791756356f704b66 (commit)
- Log -----------------------------------------------------------------
commit d212b98b36d0874eb39c5ef08a18a43b9b006142
Author: Andy Polyakov <appro at openssl.org>
Date: Wed Jul 12 16:02:42 2017 +0200
sha/asm/keccak1600-avx2.pl: optimized remodelled version.
New register usage pattern makes it possible to achieve slightly better
performance. Not as much as I hoped for. Performance is believed
to be limited by irreconcilable write-back conflicts, rather than
lack of computational resources or data dependencies.
Reviewed-by: Rich Salz <rsalz at openssl.org>
commit 91dbdc63bda4d0d4c5431d14cce70c765cc2bdaf
Author: Andy Polyakov <appro at openssl.org>
Date: Wed Jul 12 15:55:19 2017 +0200
sha/asm/keccak1600-avx2.pl: remodel register usage.
This gives much more freedom to rearrange instructions. This is
an unoptimized version, provided for reference. Basically you need
to compare it to the initial 29724d0e15b4934abdf2d7ab71957b05d1a28256
to figure out the key difference.
Reviewed-by: Rich Salz <rsalz at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/sha/asm/keccak1600-avx2.pl | 180 +++++++++++++++++++-------------------
1 file changed, 89 insertions(+), 91 deletions(-)
diff --git a/crypto/sha/asm/keccak1600-avx2.pl b/crypto/sha/asm/keccak1600-avx2.pl
index 23fe645..82ca672 100755
--- a/crypto/sha/asm/keccak1600-avx2.pl
+++ b/crypto/sha/asm/keccak1600-avx2.pl
@@ -111,8 +111,8 @@ my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
#
# r=1088(*)
#
-# Haswell 8.9/+8%
-# Skylake 7.9/+19%
+# Haswell 8.7/+10%
+# Skylake 7.8/+20%
# Ryzen 17(**)
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
@@ -141,14 +141,12 @@ __KeccakF1600:
.Loop_avx2:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
- vpxor $A31,$A01,$C14
- vpxor $A41,$A21,@T[0]
- vpxor $A11,$C14,$C14
- vpxor @T[0],$C14,$C14 # C[1..4]
+ vpxor $A31,$A41,$C14
+ vpxor $A11,$A21,@T[2]
+ vpxor $A01,$C14,$C14
+ vpxor @T[2],$C14,$C14 # C[1..4]
- vpermq \$0b11111111,$C14,@T[3]
vpermq \$0b10010011,$C14,@T[4]
-
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]
@@ -157,7 +155,8 @@ __KeccakF1600:
vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)
vpermq \$0b00111001,@T[1],$D14
- vpxor @T[3],@T[1],$D00
+ vpxor @T[4],@T[1],$D00
+ vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]
@@ -166,7 +165,6 @@ __KeccakF1600:
vpaddq $C00,$C00,@T[1]
vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)
- vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D00,$A00,$A00 # ^= D[0..0]
@@ -175,98 +173,98 @@ __KeccakF1600:
vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
######################################### Rho + Pi + pre-Chi shuffle
- vpsllvq 0*32-96(%r8),$A20,@T[0]
+ vpsllvq 0*32-96(%r8),$A20,@T[3]
vpsrlvq 0*32-96(%r9),$A20,$A20
- vpor $A20,@T[0],@T[0] # $A20
+ vpor @T[3],$A20,$A20
- vpxor $D14,$A31,$A31 # ^= D[1..4]
- vpsllvq 2*32-96(%r8),$A31,@T[2]
+ vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
+ vpsllvq 2*32-96(%r8),$A31,@T[4]
vpsrlvq 2*32-96(%r9),$A31,$A31
- vpor $A31,@T[2],@T[2] # $A31
+ vpor @T[4],$A31,$A31
- vpxor $D14,$A21,$A21 # ^= D[1..4]
- vpsllvq 3*32-96(%r8),$A21,@T[3]
+ vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
+ vpsllvq 3*32-96(%r8),$A21,@T[5]
vpsrlvq 3*32-96(%r9),$A21,$A21
- vpor $A21,@T[3],@T[3] # $A21
+ vpor @T[5],$A21,$A21
- vpermq \$0b10001101,@T[0],$A31 # $A20 -> $A31
- vpermq \$0b10001101,@T[2],$A21 # $A31 -> $A21
- vpxor $D14,$A41,$A41 # ^= D[1..4]
- vpsllvq 4*32-96(%r8),$A41,@T[4]
+ vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
+ vpsllvq 4*32-96(%r8),$A41,@T[6]
vpsrlvq 4*32-96(%r9),$A41,$A41
+ vpor @T[6],$A41,$A41
+
+ vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
+ vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
+ vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
+ vpsllvq 5*32-96(%r8),$A11,@T[7]
+ vpsrlvq 5*32-96(%r9),$A11,@T[1]
+ vpor @T[7],@T[1],@T[1] # $A11 -> future $A01
+
+ vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
+ vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
+ vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
+ vpsllvq 1*32-96(%r8),$A01,@T[8]
+ vpsrlvq 1*32-96(%r9),$A01,@T[2]
+ vpor @T[8],@T[2],@T[2] # $A01 -> future $A20
- vpxor $D14,$A01,$A01 # ^= D[1..4]
- vpxor $D14,$A11,$T[6] # ^= D[1..4]
- vpsllvq 1*32-96(%r8),$A01,@T[1]
- vpsrlvq 1*32-96(%r9),$A01,$A01
- vpor $A41,@T[4],@T[4] # $A41
- vpor @T[1],$A01,$A20 # $A01 -> $A20
+ ######################################### Chi
+ vpsrldq \$8,@T[1],@T[7]
+ vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]
+
+ vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
+ vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
+ vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
+ vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
+ vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
+ vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
+ vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
+ vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
+ vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
+ vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
+ vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
+ vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
+ vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]
+
+ vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
+ vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
+ vpxor @T[3],$A31,$A31
+ vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
+ vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
+ vpxor @T[5],$A41,$A41
+ vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
+ vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
+ vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
+ vpxor @T[6],$A11,$A11
+
+ vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
+ vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
+ vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
+ vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
+ vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]
+
+ vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
+ vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
+ vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
+ vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
+ vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
+ vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
+ vpxor @T[2],$A20,$A20
- vpermq \$0b00011011,@T[3],$A41 # $A21 -> $A41
- vpermq \$0b01110010,@T[4],$A11 # $A41 -> $A11
- vpsllvq 5*32-96(%r8),$T[6],@T[5]
- vpsrlvq 5*32-96(%r9),@T[6],@T[6]
- vpor @T[5],@T[6],$A01 # $A11 -> $A01
+ vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
+ vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
+ vpermq \$0b10001101,$A41,$A41
+ vpermq \$0b01110010,$A11,$A11
- ######################################### Chi
- vpsrldq \$8,$A01,@T[0]
- vpandn @T[0],$A01,@T[0] # tgting [0][0]
-
- vpermq \$0b00111001,$A01,@T[1] # [0][1] [0][4] [0][3] [0][2]
- vpermq \$0b00011110,$A01,@T[8] # [0][1] [0][2] [0][4] [0][3]
- vpblendd \$0b11000000,$A00,@T[1],@T[1] # [0][0] [0][4] [0][3] [0][2]
- vpblendd \$0b00110000,$A00,@T[8],@T[8] # [0][1] [0][0] [0][4] [0][3]
- vpxor @T[0],$A00,$A00 # broadcasted below
- vpandn @T[8],@T[1],@T[1] # tgting [0][4] [0][3] [0][2] [0][1]
-
- vpblendd \$0b00001100,$A41,$A21, @T[2] # [4][1] [2][1]
- vpblendd \$0b00001100,$A21,$A11, @T[4] # [4][2] [2][2]
- vpblendd \$0b00110000,$A11,@T[2],@T[2] # [1][1] [4][1] [2][1]
- vpblendd \$0b00110000,$A31,@T[4],@T[4] # [1][2] [4][2] [2][2]
- vpblendd \$0b11000000,$A31,@T[2],@T[2] # [3][1] [1][1] [4][1] [2][1]
- vpblendd \$0b11000000,$A41,@T[4],@T[4] # [3][2] [1][2] [4][2] [2][2]
- vpandn @T[4],@T[2],@T[2] # tgting [3][0] [1][0] [4][0] [2][0]
-
- vpblendd \$0b00001100,$A11,$A20, @T[3] # [4][4] [2][0]
- vpblendd \$0b00001100,$A20,$A21, @T[5] # [4][0] [2][1]
- vpblendd \$0b00110000,$A21,@T[3],@T[3] # [1][3] [4][4] [2][0]
- vpblendd \$0b00110000,$A41,@T[5],@T[5] # [1][4] [4][0] [2][1]
- vpblendd \$0b11000000,$A41,@T[3],@T[3] # [3][2] [1][3] [4][4] [2][0]
- vpblendd \$0b11000000,$A11,@T[5],@T[5] # [3][3] [1][4] [4][0] [2][1]
- vpandn @T[5],@T[3],@T[3] # tgting [3][1] [1][2] [4][3] [2][4]
- vpxor $A31,@T[3],@T[3]
-
- vpblendd \$0b00001100,$A21,$A31, @T[5] # [4][2] [2][4]
- vpblendd \$0b00001100,$A31,$A20, @T[6] # [4][3] [2][0]
- vpblendd \$0b00110000,$A20,@T[5],@T[5] # [1][0] [4][2] [2][4]
- vpblendd \$0b00110000,$A11,@T[6],@T[6] # [1][1] [4][3] [2][0]
- vpblendd \$0b11000000,$A11,@T[5],@T[5] # [3][3] [1][0] [4][2] [2][4]
- vpblendd \$0b11000000,$A21,@T[6],@T[6] # [3][4] [1][1] [4][3] [2][0]
- vpandn @T[6],@T[5],@T[5] # tgting [3][2] [1][4] [4][1] [2][3]
- vpxor $A41,@T[5],@T[5]
-
- vpblendd \$0b00001100,$A20,$A41, @T[6] # [4][0] [2][3]
- vpblendd \$0b00001100,$A41,$A31, @T[7] # [4][1] [2][4]
- vpblendd \$0b00110000,$A31,@T[6],@T[6] # [1][2] [4][0] [2][3]
- vpblendd \$0b00110000,$A21,@T[7],@T[7] # [1][3] [4][1] [2][4]
- vpblendd \$0b11000000,$A21,@T[6],@T[6] # [3][4] [1][2] [4][0] [2][3]
- vpblendd \$0b11000000,$A20,@T[7],@T[7] # [3][0] [1][3] [4][1] [2][4]
- vpblendd \$0b00001100,$A31,$A41, @T[4] # [1][4] [4][3]
- vpblendd \$0b11000000,$A31,$A41, @T[8] # [3][1] [2][3]
- vpandn @T[7],@T[6],@T[6] # tgting [3][3] [1][1] [4][4] [2][2]
- vpermq \$0b00011011,@T[3],$A31 ######### post-Chi shuffle
- vpermq \$0b10001101,@T[5],$A41
- vpxor $A11,@T[6],@T[6]
- vpermq \$0b00000000,$A00,$A00 # broadcast A[0][0]
-
- vpblendd \$0b00000011,$A11,@T[4],@T[4] # [1][4] [4][3] [2][2]
- vpblendd \$0b00001100,$A11,@T[8],@T[8] # [3][1] [4][4] [2][3]
- vpermq \$0b01110010,@T[6],$A11
- vpblendd \$0b11000000,$A20,@T[4],@T[4] # [3][0] [1][4] [4][3] [2][2]
- vpblendd \$0b00110000,$A20,@T[8],@T[8] # [3][1] [1][0] [4][4] [2][3]
- vpandn @T[8],@T[4],@T[4] # tgting [3][4] [1][3] [4][2] [2][1]
+ vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
+ vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
+ vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
+ vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
+ vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
+ vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
+ vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]
- vpxor @T[2],$A20,$A20
+ vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor @T[4],$A21,$A21
More information about the openssl-commits
mailing list