[openssl-commits] [openssl]  master update
    Andy Polyakov 
    appro at openssl.org
       
    Fri Oct 19 08:44:03 UTC 2018
    
    
  
The branch master has been updated
       via  9986bfefa420f0db920768453bef0b40507db595 (commit)
      from  03ad7c009e16a233c733098db3169c560142ccd3 (commit)
- Log -----------------------------------------------------------------
commit 9986bfefa420f0db920768453bef0b40507db595
Author: Andy Polyakov <appro at openssl.org>
Date:   Sat Sep 22 14:39:51 2018 +0200
    sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine.
    
    Yes, it's the second halving, i.e. it's now 1/4 of the original size, or
    more specifically of the inner loop. The challenge with Keccak is that you
    need more temporary registers than there are available. By reversing the
    order in which columns are assigned in Chi, it's possible to use
    three of the A[][] registers as temporaries prior to their assignment.
    
    Reviewed-by: Richard Levitte <levitte at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/7294)
-----------------------------------------------------------------------
Summary of changes:
 crypto/sha/asm/keccak1600-armv8.pl | 146 ++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 77 deletions(-)
diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
index 704ab4a..e4e94bc 100755
--- a/crypto/sha/asm/keccak1600-armv8.pl
+++ b/crypto/sha/asm/keccak1600-armv8.pl
@@ -533,30 +533,28 @@ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
             (0, 5, 10, 15, 20));
 
 my @C = map("v$_.16b", (25..31));
+my @D = @C[4,5,6,2,3];
 
 $code.=<<___;
 .type	KeccakF1600_ce,%function
 .align	5
 KeccakF1600_ce:
-	mov	x9,#12
+	mov	x9,#24
 	adr	x10,iotas
 	b	.Loop_ce
 .align	4
 .Loop_ce:
-___
-for($i=0; $i<2; $i++) {
-$code.=<<___;
 	////////////////////////////////////////////////// Theta
-	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
-	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
-	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
-	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
-	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
-	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
-	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
-	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
-	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
-	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]
+	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
+	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
+	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
+	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
+	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
+	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
+	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
+	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
+	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
+	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]
 
 	rax1	$C[5],$C[0],$C[2]			// D[1]
 	rax1	$C[6],$C[1],$C[3]			// D[2]
@@ -565,81 +563,75 @@ $code.=<<___;
 	rax1	$C[4],$C[4],$C[1]			// D[0]
 
 	////////////////////////////////////////////////// Theta+Rho+Pi
-	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
-	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
-	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
-	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
-	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
+	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
 
-	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
+	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
+	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
+	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
+	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
+	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
 
-	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
-	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
-	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
-	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
-	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
+	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
 
-	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
+	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
+	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
+	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
+	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
+	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
 
-	eor	$A[0][0],$A[0][0],$C[4]
-	ldr	x11,[x10],#8
+	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
 
-	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
-	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
-	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
-	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
-	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
+	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
+	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
+	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
+	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
+	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
 
-	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *
+	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
 
-	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
-	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
-	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
-	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
-	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
+	eor	$A[0][0],$A[0][0],$D[0]
 
-	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]
+	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
+	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
+	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
+	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
+	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
 
 	////////////////////////////////////////////////// Chi+Iota
-	dup	$C[6],x11				// borrow C[6]
-	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
-	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
-	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
-	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
-	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]
-
-	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
-	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
-	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
-	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
-	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]
-
-	eor	$A[0][0],$C[3],$C[6]			// Iota
-
-	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
-	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
-	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
-	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
-	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]
+	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
+	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
+	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
+	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
+	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]
+
+	ld1r	{$C[1]},[x10],#8
 
-	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
-	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
-	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
-	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
+	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
+	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
 	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
+	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
+	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]
+
+	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
+	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
+	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
+	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
+	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]
+
+	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
+	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
+	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
+	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
+	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]
+
+	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
+	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
+	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
+	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
+	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]
+
+	eor	$A[0][0],$A[0][0],$C[1]
 
-	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
-	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
-	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
-	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
-	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
-___
-	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
-	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
-	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
-	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
-}
-$code.=<<___;
 	subs	x9,x9,#1
 	bne	.Loop_ce
 
@@ -857,7 +849,7 @@ foreach(split("\n",$code)) {
 
 	s/\`([^\`]*)\`/eval($1)/ge;
 
-	m/\bdup\b/ and s/\.16b/.2d/g	or
+	m/\bld1r\b/ and s/\.16b/.2d/g	or
 	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
 
 	print $_,"\n";
    
    
More information about the openssl-commits
mailing list