[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Fri Oct 19 08:44:03 UTC 2018
The branch master has been updated
via 9986bfefa420f0db920768453bef0b40507db595 (commit)
from 03ad7c009e16a233c733098db3169c560142ccd3 (commit)
- Log -----------------------------------------------------------------
commit 9986bfefa420f0db920768453bef0b40507db595
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Sep 22 14:39:51 2018 +0200
sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine.
Yes, it's the second halving, i.e. it's now 1/4 of the original size, or more
specifically, of the inner loop. The challenge with Keccak is that you need
more temporary registers than there are available. By reversing the
order in which columns are assigned in Chi, it's possible to use
three of A[][] registers as temporary prior to their assignment.
Reviewed-by: Richard Levitte <levitte at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/7294)
-----------------------------------------------------------------------
Summary of changes:
crypto/sha/asm/keccak1600-armv8.pl | 146 ++++++++++++++++++-------------------
1 file changed, 69 insertions(+), 77 deletions(-)
diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
index 704ab4a..e4e94bc 100755
--- a/crypto/sha/asm/keccak1600-armv8.pl
+++ b/crypto/sha/asm/keccak1600-armv8.pl
@@ -533,30 +533,28 @@ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
(0, 5, 10, 15, 20));
my @C = map("v$_.16b", (25..31));
+my @D = @C[4,5,6,2,3];
$code.=<<___;
.type KeccakF1600_ce,%function
.align 5
KeccakF1600_ce:
- mov x9,#12
+ mov x9,#24
adr x10,iotas
b .Loop_ce
.align 4
.Loop_ce:
-___
-for($i=0; $i<2; $i++) {
-$code.=<<___;
////////////////////////////////////////////////// Theta
- eor3 $C[0],$A[0][0],$A[1][0],$A[2][0]
- eor3 $C[1],$A[0][1],$A[1][1],$A[2][1]
- eor3 $C[2],$A[0][2],$A[1][2],$A[2][2]
- eor3 $C[3],$A[0][3],$A[1][3],$A[2][3]
- eor3 $C[4],$A[0][4],$A[1][4],$A[2][4]
- eor3 $C[0],$C[0], $A[3][0],$A[4][0]
- eor3 $C[1],$C[1], $A[3][1],$A[4][1]
- eor3 $C[2],$C[2], $A[3][2],$A[4][2]
- eor3 $C[3],$C[3], $A[3][3],$A[4][3]
- eor3 $C[4],$C[4], $A[3][4],$A[4][4]
+ eor3 $C[0],$A[4][0],$A[3][0],$A[2][0]
+ eor3 $C[1],$A[4][1],$A[3][1],$A[2][1]
+ eor3 $C[2],$A[4][2],$A[3][2],$A[2][2]
+ eor3 $C[3],$A[4][3],$A[3][3],$A[2][3]
+ eor3 $C[4],$A[4][4],$A[3][4],$A[2][4]
+ eor3 $C[0],$C[0], $A[1][0],$A[0][0]
+ eor3 $C[1],$C[1], $A[1][1],$A[0][1]
+ eor3 $C[2],$C[2], $A[1][2],$A[0][2]
+ eor3 $C[3],$C[3], $A[1][3],$A[0][3]
+ eor3 $C[4],$C[4], $A[1][4],$A[0][4]
rax1 $C[5],$C[0],$C[2] // D[1]
rax1 $C[6],$C[1],$C[3] // D[2]
@@ -565,81 +563,75 @@ $code.=<<___;
rax1 $C[4],$C[4],$C[1] // D[0]
////////////////////////////////////////////////// Theta+Rho+Pi
- xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1]
- xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
- xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
- xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
- xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
+ xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
- xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
+ xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
+ xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
+ xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
+ xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
+ xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
- xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
- xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
- xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
- xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
- xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
+ xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
- xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
+ xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
+ xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
+ xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
+ xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
+ xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
- eor $A[0][0],$A[0][0],$C[4]
- ldr x11,[x10],#8
+ xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
- xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3]
- xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
- xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
- xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
- xar $A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
+ xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
+ xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
+ xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
+ xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
+ xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
- xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // *
+ xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
- xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
- xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
- xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
- xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
- xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
+ eor $A[0][0],$A[0][0],$D[0]
- xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0]
+ xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
+ xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
+ xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
+ xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
+ xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
////////////////////////////////////////////////// Chi+Iota
- dup $C[6],x11 // borrow C[6]
- bcax $C[3], $A[0][0],$A[0][2],$C[0] // *
- bcax $A[0][1],$C[0], $C[1], $A[0][2] // *
- bcax $A[0][2],$A[0][2],$A[0][4],$C[1]
- bcax $A[0][3],$C[1], $A[0][0],$A[0][4]
- bcax $A[0][4],$A[0][4],$C[0], $A[0][0]
-
- bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // *
- bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // *
- bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3]
- bcax $A[1][3],$A[1][3],$C[2], $A[1][4]
- bcax $A[1][4],$A[1][4],$A[1][1],$C[2]
-
- eor $A[0][0],$C[3],$C[6] // Iota
-
- bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // *
- bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // *
- bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
- bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4]
- bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0]
+ bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1]
+ bcax $A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1]
+ bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
+ bcax $A[4][3],$A[4][3],$C[1], $A[4][4]
+ bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1]
+
+ ld1r {$C[1]},[x10],#8
- bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // *
- bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // *
- bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3]
- bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4]
+ bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3]
+ bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3]
bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
+ bcax $A[3][0],$A[3][0],$D[1], $A[3][1]
+ bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3]
+
+ bcax $A[2][0],$C[0], $A[2][2],$D[2]
+ bcax $A[2][1],$D[2], $A[2][3],$A[2][2]
+ bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
+ bcax $A[2][3],$A[2][3],$C[0], $A[2][4]
+ bcax $A[2][4],$A[2][4],$D[2], $C[0]
+
+ bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3]
+ bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3]
+ bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0]
+ bcax $A[1][0],$A[1][0],$D[0], $A[1][1]
+ bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3]
+
+ bcax $A[0][3],$D[3], $A[0][0],$D[4]
+ bcax $A[0][4],$D[4], $A[0][1],$A[0][0]
+ bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1]
+ bcax $A[0][1],$A[0][1],$D[3], $A[0][2]
+ bcax $A[0][2],$A[0][2],$D[4], $D[3]
+
+ eor $A[0][0],$A[0][0],$C[1]
- bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // *
- bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // *
- bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
- bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4]
- bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0]
-___
- ( $A[1][1], $C[0]) = ( $C[0], $A[1][1]);
- ($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
- ($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
- ($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
-}
-$code.=<<___;
subs x9,x9,#1
bne .Loop_ce
@@ -857,7 +849,7 @@ foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
- m/\bdup\b/ and s/\.16b/.2d/g or
+ m/\bld1r\b/ and s/\.16b/.2d/g or
s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
print $_,"\n";
More information about the openssl-commits
mailing list