[openssl-commits] [openssl] master update

Andy Polyakov appro at openssl.org
Fri Oct 19 08:44:03 UTC 2018


The branch master has been updated
       via  9986bfefa420f0db920768453bef0b40507db595 (commit)
      from  03ad7c009e16a233c733098db3169c560142ccd3 (commit)


- Log -----------------------------------------------------------------
commit 9986bfefa420f0db920768453bef0b40507db595
Author: Andy Polyakov <appro at openssl.org>
Date:   Sat Sep 22 14:39:51 2018 +0200

    sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine.
    
    Yes, it's the second halving, i.e. it's now 1/4 of the original size, or
    more specifically, of the inner loop. The challenge with Keccak is that you need
    more temporary registers than there are available. By reversing the
    order in which columns are assigned in Chi, it's possible to use
    three of the A[][] registers as temporaries prior to their assignment.
    
    Reviewed-by: Richard Levitte <levitte at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/7294)

-----------------------------------------------------------------------

Summary of changes:
 crypto/sha/asm/keccak1600-armv8.pl | 146 ++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 77 deletions(-)

diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
index 704ab4a..e4e94bc 100755
--- a/crypto/sha/asm/keccak1600-armv8.pl
+++ b/crypto/sha/asm/keccak1600-armv8.pl
@@ -533,30 +533,28 @@ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
             (0, 5, 10, 15, 20));
 
 my @C = map("v$_.16b", (25..31));
+my @D = @C[4,5,6,2,3];
 
 $code.=<<___;
 .type	KeccakF1600_ce,%function
 .align	5
 KeccakF1600_ce:
-	mov	x9,#12
+	mov	x9,#24
 	adr	x10,iotas
 	b	.Loop_ce
 .align	4
 .Loop_ce:
-___
-for($i=0; $i<2; $i++) {
-$code.=<<___;
 	////////////////////////////////////////////////// Theta
-	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
-	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
-	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
-	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
-	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
-	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
-	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
-	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
-	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
-	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]
+	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
+	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
+	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
+	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
+	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
+	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
+	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
+	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
+	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
+	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]
 
 	rax1	$C[5],$C[0],$C[2]			// D[1]
 	rax1	$C[6],$C[1],$C[3]			// D[2]
@@ -565,81 +563,75 @@ $code.=<<___;
 	rax1	$C[4],$C[4],$C[1]			// D[0]
 
 	////////////////////////////////////////////////// Theta+Rho+Pi
-	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
-	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
-	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
-	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
-	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
+	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
 
-	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
+	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
+	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
+	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
+	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
+	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
 
-	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
-	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
-	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
-	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
-	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
+	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
 
-	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
+	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
+	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
+	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
+	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
+	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
 
-	eor	$A[0][0],$A[0][0],$C[4]
-	ldr	x11,[x10],#8
+	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
 
-	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
-	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
-	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
-	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
-	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
+	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
+	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
+	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
+	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
+	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
 
-	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *
+	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
 
-	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
-	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
-	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
-	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
-	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
+	eor	$A[0][0],$A[0][0],$D[0]
 
-	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]
+	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
+	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
+	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
+	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
+	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
 
 	////////////////////////////////////////////////// Chi+Iota
-	dup	$C[6],x11				// borrow C[6]
-	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
-	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
-	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
-	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
-	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]
-
-	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
-	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
-	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
-	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
-	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]
-
-	eor	$A[0][0],$C[3],$C[6]			// Iota
-
-	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
-	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
-	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
-	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
-	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]
+	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
+	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
+	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
+	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
+	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]
+
+	ld1r	{$C[1]},[x10],#8
 
-	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
-	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
-	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
-	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
+	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
+	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
 	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
+	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
+	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]
+
+	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
+	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
+	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
+	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
+	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]
+
+	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
+	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
+	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
+	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
+	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]
+
+	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
+	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
+	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
+	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
+	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]
+
+	eor	$A[0][0],$A[0][0],$C[1]
 
-	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
-	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
-	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
-	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
-	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
-___
-	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
-	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
-	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
-	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
-}
-$code.=<<___;
 	subs	x9,x9,#1
 	bne	.Loop_ce
 
@@ -857,7 +849,7 @@ foreach(split("\n",$code)) {
 
 	s/\`([^\`]*)\`/eval($1)/ge;
 
-	m/\bdup\b/ and s/\.16b/.2d/g	or
+	m/\bld1r\b/ and s/\.16b/.2d/g	or
 	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
 
 	print $_,"\n";


More information about the openssl-commits mailing list