[openssl-commits] [openssl] master update

Andy Polyakov appro at openssl.org
Sat Feb 25 17:37:03 UTC 2017


The branch master has been updated
       via  fd910ef9593d4e16dabf4686ecabb351830045b6 (commit)
       via  73e8a5c8261625a6e90e07e567263c69039e3d17 (commit)
       via  c1e1fc500da910dbf4358f902f6b824a3c34b922 (commit)
      from  c749308fc44a0b33b340e23834320dbef9fbf8de (commit)


- Log -----------------------------------------------------------------
commit fd910ef9593d4e16dabf4686ecabb351830045b6
Author: Andy Polyakov <appro at openssl.org>
Date:   Fri Dec 30 00:00:16 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: add VPMADD52 code path.
    
    This is an initial, minimal single-block implementation.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>

commit 73e8a5c8261625a6e90e07e567263c69039e3d17
Author: Andy Polyakov <appro at openssl.org>
Date:   Sun Dec 25 13:10:00 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.

    Effectively it's a minor size optimization, 5-6% per affected subroutine.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>

commit c1e1fc500da910dbf4358f902f6b824a3c34b922
Author: Andy Polyakov <appro at openssl.org>
Date:   Sun Dec 25 13:05:35 2016 +0100

    poly1305/asm/poly1305-x86_64.pl: optimize AVX512 code path.
    
    On pre-Skylake processors the best optimization strategy was to balance
    port-specific instructions, while on Skylake minimizing the sheer
    instruction count appears more sensible.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>

-----------------------------------------------------------------------
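
For reference, the 2^44 radix used by the new VPMADD52 path works as follows: the clamped key r is split into 44-, 44- and 40-bit limbs, and the multiples 20*r1 and 20*r2 are pre-computed so that limb products crossing the 2^130 boundary can be folded back via 2^130 = 5 (mod 2^130-5). Below is a minimal Perl sketch mirroring poly1305_init_base2_44 from the patch; it is illustrative only (the sub name and test values are made up, and a 64-bit perl is assumed). A matching scalar model of the per-block multiply appears after the diff.

#!/usr/bin/env perl
# Illustrative sketch (not part of the patch): split a clamped Poly1305
# key into base-2^44 limbs and pre-compute 20*r1, 20*r2, mirroring the
# register arithmetic of poly1305_init_base2_44.  Assumes a 64-bit perl.
use strict;
use warnings;

sub init_base2_44 {
    my ($k0, $k1) = @_;                 # little-endian 64-bit key words
    $k0 &= 0x0ffffffc0fffffff;          # clamp r, same masks as the asm
    $k1 &= 0x0ffffffc0ffffffc;

    my $r0 = $k0 & 0xfffffffffff;                            # bits  0..43
    my $r1 = (($k0 >> 44) | (($k1 & 0xffffff) << 20))
             & 0xfffffffffff;                                # bits 44..87
    my $r2 = $k1 >> 24;                                      # bits 88..127

    # 2^132 = 4*2^130 = 20 (mod 2^130-5), so limb products that spill
    # past the 2^130 boundary are folded back with 20*r1 and 20*r2,
    # which is what the s1/s2 values stored at 24($ctx)/32($ctx) hold.
    my ($s1, $s2) = (($r1 * 5) << 2, ($r2 * 5) << 2);
    return ($r0, $r1, $r2, $s1, $s2);
}

# made-up test key, just to show the call
printf "%#x %#x %#x %#x %#x\n",
       init_base2_44(0x0706050403020100, 0x0f0e0d0c0b0a0908);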

Summary of changes:
 crypto/poly1305/asm/poly1305-x86_64.pl | 465 +++++++++++++++++++++++----------
 1 file changed, 325 insertions(+), 140 deletions(-)

diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index baf3c75..ff4efb3 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -62,13 +62,13 @@ die "can't locate x86_64-xlate.pl";
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
 }
 
 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
-	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
-	$avx += 1 if ($1==2.11 && $2>=8);
+	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
+	$avx += 2 if ($1==2.11 && $2>=8);
 }
 
 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
@@ -178,6 +178,13 @@ $code.=<<___	if ($avx>1);
 	bt	\$`5+32`,%r9		# AVX2?
 	cmovc	%rax,%r10
 ___
+$code.=<<___	if ($avx>3);
+	mov	\$`(1<<31|1<<21|1<<16)`,%rax
+	shr	\$32,%r9
+	and	%rax,%r9
+	cmp	%rax,%r9
+	je	.Linit_base2_44
+___
 $code.=<<___;
 	mov	\$0x0ffffffc0fffffff,%rax
 	mov	\$0x0ffffffc0ffffffc,%rcx
@@ -1631,8 +1638,9 @@ $code.=<<___	if ($win64);
 .Ldo_avx2_body:
 ___
 $code.=<<___;
-	lea		48+64($ctx),$ctx	# size optimization
 	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
 
 	# expand and copy pre-calculated table to stack
 	vmovdqu		`16*0-64`($ctx),%x#$T2
@@ -1642,36 +1650,28 @@ $code.=<<___;
 	vmovdqu		`16*3-64`($ctx),%x#$D0
 	vmovdqu		`16*4-64`($ctx),%x#$D1
 	vmovdqu		`16*5-64`($ctx),%x#$D2
+	lea		0x90(%rsp),%rax		# size optimization
 	vmovdqu		`16*6-64`($ctx),%x#$D3
-	vpermq		\$0x15,$T2,$T2		# 00003412 -> 12343434
+	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
 	vmovdqu		`16*7-64`($ctx),%x#$D4
-	vpermq		\$0x15,$T3,$T3
-	vpshufd		\$0xc8,$T2,$T2		# 12343434 -> 14243444
+	vpermd		$T3,$T0,$T3
 	vmovdqu		`16*8-64`($ctx),%x#$MASK
-	vpermq		\$0x15,$T4,$T4
-	vpshufd		\$0xc8,$T3,$T3
+	vpermd		$T4,$T0,$T4
 	vmovdqa		$T2,0x00(%rsp)
-	vpermq		\$0x15,$D0,$D0
-	vpshufd		\$0xc8,$T4,$T4
-	vmovdqa		$T3,0x20(%rsp)
-	vpermq		\$0x15,$D1,$D1
-	vpshufd		\$0xc8,$D0,$D0
-	vmovdqa		$T4,0x40(%rsp)
-	vpermq		\$0x15,$D2,$D2
-	vpshufd		\$0xc8,$D1,$D1
-	vmovdqa		$D0,0x60(%rsp)
-	vpermq		\$0x15,$D3,$D3
-	vpshufd		\$0xc8,$D2,$D2
-	vmovdqa		$D1,0x80(%rsp)
-	vpermq		\$0x15,$D4,$D4
-	vpshufd		\$0xc8,$D3,$D3
-	vmovdqa		$D2,0xa0(%rsp)
-	vpermq		\$0x15,$MASK,$MASK
-	vpshufd		\$0xc8,$D4,$D4
-	vmovdqa		$D3,0xc0(%rsp)
-	vpshufd		\$0xc8,$MASK,$MASK
-	vmovdqa		$D4,0xe0(%rsp)
-	vmovdqa		$MASK,0x100(%rsp)
+	vpermd		$D0,$T0,$D0
+	vmovdqa		$T3,0x20-0x90(%rax)
+	vpermd		$D1,$T0,$D1
+	vmovdqa		$T4,0x40-0x90(%rax)
+	vpermd		$D2,$T0,$D2
+	vmovdqa		$D0,0x60-0x90(%rax)
+	vpermd		$D3,$T0,$D3
+	vmovdqa		$D1,0x80-0x90(%rax)
+	vpermd		$D4,$T0,$D4
+	vmovdqa		$D2,0xa0-0x90(%rax)
+	vpermd		$MASK,$T0,$MASK
+	vmovdqa		$D3,0xc0-0x90(%rax)
+	vmovdqa		$D4,0xe0-0x90(%rax)
+	vmovdqa		$MASK,0x100-0x90(%rax)
 	vmovdqa		64(%rcx),$MASK		# .Lmask26
 
 	################################################################
@@ -1698,7 +1698,6 @@ $code.=<<___;
 	vpand		$MASK,$T3,$T3		# 3
 	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
 
-	lea		0x90(%rsp),%rax		# size optimization
 	vpaddq		$H2,$T2,$H2		# accumulate input
 	sub		\$64,$len
 	jz		.Ltail_avx2
@@ -2055,8 +2054,9 @@ $code.=<<___	if ($win64);
 .Ldo_avx512_body:
 ___
 $code.=<<___;
-	lea		48+64($ctx),$ctx	# size optimization
 	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),$T2		# .Lpermd_avx2
 
 	# expand pre-calculated table
 	vmovdqu32	`16*0-64`($ctx),%x#$R0
@@ -2069,33 +2069,23 @@ $code.=<<___;
 	vmovdqu32	`16*6-64`($ctx),%x#$S3
 	vmovdqu32	`16*7-64`($ctx),%x#$R4
 	vmovdqu32	`16*8-64`($ctx),%x#$S4
-	vpermq		\$0x15,$R0,$R0		# 00003412 -> 12343434
+	vpermd		$R0,$T2,$R0		# 00003412 -> 14243444
 	vmovdqa64	64(%rcx),$MASK		# .Lmask26
-	vpermq		\$0x15,$R1,$R1
-	vmovdqa32	128(%rcx),$GATHER	# .Lgather
-	vpermq		\$0x15,$S1,$S1
-	vpshufd		\$0xc8,$R0,$R0		# 12343434 -> 14243444
-	vpermq		\$0x15,$R2,$R2
-	vpshufd		\$0xc8,$R1,$R1
+	vpermd		$R1,$T2,$R1
+	vpermd		$S1,$T2,$S1
+	vpermd		$R2,$T2,$R2
 	vmovdqa32	$R0,0x00(%rsp)		# save in case $len%128 != 0
 	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
-	vpermq		\$0x15,$S2,$S2
-	vpshufd		\$0xc8,$S1,$S1
+	vpermd		$S2,$T2,$S2
 	vmovdqa32	$R1,0x20(%rsp)
 	 vpsrlq		\$32,$R1,$T1
-	vpermq		\$0x15,$R3,$R3
-	vpshufd		\$0xc8,$R2,$R2
+	vpermd		$R3,$T2,$R3
 	vmovdqa32	$S1,0x40(%rsp)
-	vpermq		\$0x15,$S3,$S3
-	vpshufd		\$0xc8,$S2,$S2
-	vpermq		\$0x15,$R4,$R4
-	vpshufd		\$0xc8,$R3,$R3
+	vpermd		$S3,$T2,$S3
+	vpermd		$R4,$T2,$R4
 	vmovdqa32	$R2,0x60(%rsp)
-	vpermq		\$0x15,$S4,$S4
-	vpshufd		\$0xc8,$S3,$S3
+	vpermd		$S4,$T2,$S4
 	vmovdqa32	$S2,0x80(%rsp)
-	vpshufd		\$0xc8,$R4,$R4
-	vpshufd		\$0xc8,$S4,$S4
 	vmovdqa32	$R3,0xa0(%rsp)
 	vmovdqa32	$S3,0xc0(%rsp)
 	vmovdqa32	$R4,0xe0(%rsp)
@@ -2165,10 +2155,9 @@ $code.=<<___;
 
 	################################################################
 	# load input
-	vmovdqu64	16*0($inp),%x#$T0
-	vmovdqu64	16*1($inp),%x#$T1
-	vinserti64x2	\$1,16*2($inp),$T0,$T0
-	vinserti64x2	\$1,16*3($inp),$T1,$T1
+	vmovdqu64	16*0($inp),%z#$T3
+	vmovdqu64	16*4($inp),%z#$T4
+	lea		16*8($inp),$inp
 
 	################################################################
 	# lazy reduction
@@ -2205,50 +2194,51 @@ $code.=<<___;
 	vpaddq		$M3,$D4,$D4		# d3 -> d4
 
 ___
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
 map(s/%y/%z/,($MASK));
 $code.=<<___;
 	################################################################
-	# load more input
-	vinserti64x2	\$2,16*4($inp),$T0,$T0
-	vinserti64x2	\$2,16*5($inp),$T1,$T1
-	vinserti64x2	\$3,16*6($inp),$T0,$T0
-	vinserti64x2	\$3,16*7($inp),$T1,$T1
-	lea		16*8($inp),$inp
+	# at this point we have 14243444 in $R0-$S4 and 05060708 in
+	# $D0-$D4, ...
 
-	vpbroadcastq	%x#$MASK,$MASK
-	vpbroadcastq	32(%rcx),$PADBIT
+	vpunpcklqdq	$T4,$T3,$T0	# transpose input
+	vpunpckhqdq	$T4,$T3,$T4
 
-	################################################################
-	# at this point we have 14243444 in $R0-$S4 and 05060708 in
-	# $D0-$D4, and the goal is 1828384858687888 in $R0-$S4
+	# ... since input 64-bit lanes are ordered as 73625140, we could
+	# "vperm" it to 76543210 (here and in each loop iteration), *or*
+	# we could just flow along, hence the goal for $R0-$S4 is
+	# 1858286838784888 ...
+
+	mov		\$0b0110011001100110,%eax
+	mov		\$0b1100110011001100,%r8d
+	mov		\$0b0101010101010101,%r9d
+	kmovw		%eax,%k1
+	kmovw		%r8d,%k2
+	kmovw		%r9d,%k3
 
-	mov		\$0x5555,%eax
-	vpbroadcastq	%x#$D0,$M0		# 0808080808080808
+	vpbroadcastq	%x#$D0,$M0	# 0808080808080808
 	vpbroadcastq	%x#$D1,$M1
 	vpbroadcastq	%x#$D2,$M2
 	vpbroadcastq	%x#$D3,$M3
 	vpbroadcastq	%x#$D4,$M4
-	kmovw		%eax,%k3
-	vpsllq		\$32,$D0,$D0		# 05060708 -> 50607080
-	vpsllq		\$32,$D1,$D1
-	vpsllq		\$32,$D2,$D2
-	vpsllq		\$32,$D3,$D3
-	vpsllq		\$32,$D4,$D4
-___
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-$code.=<<___;
-	vinserti64x4	\$1,$R0,$D0,$D0		# 1424344450607080
-	vinserti64x4	\$1,$R1,$D1,$D1
-	vinserti64x4	\$1,$R2,$D2,$D2
-	vinserti64x4	\$1,$R3,$D3,$D3
-	vinserti64x4	\$1,$R4,$D4,$D4
-___
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
-$code.=<<___;
-	vpblendmd	$M0,$D0,${R0}{%k3}	# 1828384858687888
+
+	vpexpandd	$D0,${D0}{%k1}	# 05060708 -> -05--06--07--08-
+	vpexpandd	$D1,${D1}{%k1}
+	vpexpandd	$D2,${D2}{%k1}
+	vpexpandd	$D3,${D3}{%k1}
+	vpexpandd	$D4,${D4}{%k1}
+
+	vpexpandd	$R0,${D0}{%k2}	# -05--06--07--08- -> 145-246-347-448-
+	vpexpandd	$R1,${D1}{%k2}
+	vpexpandd	$R2,${D2}{%k2}
+	vpexpandd	$R3,${D3}{%k2}
+	vpexpandd	$R4,${D4}{%k2}
+
+	vpblendmd	$M0,$D0,${R0}{%k3}	# 1858286838784888
 	vpblendmd	$M1,$D1,${R1}{%k3}
 	vpblendmd	$M2,$D2,${R2}{%k3}
 	vpblendmd	$M3,$D3,${R3}{%k3}
@@ -2263,27 +2253,28 @@ $code.=<<___;
 	vpaddd		$R3,$S3,$S3
 	vpaddd		$R4,$S4,$S4
 
-	vpsrldq		\$6,$T0,$T2		# splat input
-	vpsrldq		\$6,$T1,$T3
-	vpunpckhqdq	$T1,$T0,$T4		# 4
-	vpunpcklqdq	$T3,$T2,$T2		# 2:3
-	vpunpcklqdq	$T1,$T0,$T0		# 0:1
+	vpbroadcastq	%x#$MASK,$MASK
+	vpbroadcastq	32(%rcx),$PADBIT	# .L129
 
-	vpsrlq		\$30,$T2,$T3
-	vpsrlq		\$4,$T2,$T2
+	vpsrlq		\$52,$T0,$T2		# splat input
+	vpsllq		\$12,$T4,$T3
+	vporq		$T3,$T2,$T2
 	vpsrlq		\$26,$T0,$T1
+	vpsrlq		\$14,$T4,$T3
 	vpsrlq		\$40,$T4,$T4		# 4
 	vpandq		$MASK,$T2,$T2		# 2
 	vpandq		$MASK,$T0,$T0		# 0
-	#vpandq		$MASK,$T1,$T1		# 1
-	#vpandq		$MASK,$T3,$T3		# 3
+	vpandq		$MASK,$T1,$T1		# 1
+	vpandq		$MASK,$T3,$T3		# 3
 	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 
 	vpaddq		$H2,$T2,$H2		# accumulate input
 	mov		\$0x0f,%eax
 	sub		\$192,$len
 	jbe		.Ltail_avx512
+	jmp		.Loop_avx512
 
+.align	32
 .Loop_avx512:
 	################################################################
 	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2315,12 +2306,8 @@ $code.=<<___;
 
 	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
 	 vpaddq		$H0,$T0,$H0
-	  vmovdqu64	16*0($inp),%x#$M0	# load input
 	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
-	 vpandq		$MASK,$T1,$T1		# 1, module-scheduled
-	  vmovdqu64	16*1($inp),%x#$M1
 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
-	 vpandq		$MASK,$T3,$T3		# 3
 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
@@ -2328,8 +2315,9 @@ $code.=<<___;
 	 vpaddq		$H3,$T3,$H3
 	 vpaddq		$H4,$T4,$H4
 
-	  vinserti64x2	\$1,16*2($inp),$M0,$T0
-	  vinserti64x2	\$1,16*3($inp),$M1,$T1
+	  vmovdqu64	16*0($inp),$T3		# load input
+	  vmovdqu64	16*4($inp),$T4
+	  lea		16*8($inp),$inp
 	vpmuludq	$H0,$R3,$M3
 	vpmuludq	$H0,$R4,$M4
 	vpmuludq	$H0,$R0,$M0
@@ -2339,8 +2327,6 @@ $code.=<<___;
 	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
 	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
 
-	  vinserti64x2	\$2,16*4($inp),$T0,$T0
-	  vinserti64x2	\$2,16*5($inp),$T1,$T1
 	vpmuludq	$H1,$R2,$M3
 	vpmuludq	$H1,$R3,$M4
 	vpmuludq	$H1,$S4,$M0
@@ -2350,8 +2336,9 @@ $code.=<<___;
 	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
 	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
 
-	  vinserti64x2	\$3,16*6($inp),$T0,$T0
-	  vinserti64x2	\$3,16*7($inp),$T1,$T1
+	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
+	  vpunpckhqdq	$T4,$T3,$T4
+
 	vpmuludq	$H3,$R0,$M3
 	vpmuludq	$H3,$R1,$M4
 	vpmuludq	$H1,$R0,$M1
@@ -2361,9 +2348,6 @@ $code.=<<___;
 	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
 	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
 
-	  vpsrldq	\$6,$T0,$T2		# splat input
-	  vpsrldq	\$6,$T1,$T3
-	  vpunpckhqdq	$T1,$T0,$T4		# 4
 	vpmuludq	$H4,$S4,$M3
 	vpmuludq	$H4,$R0,$M4
 	vpmuludq	$H3,$S2,$M0
@@ -2375,9 +2359,6 @@ $code.=<<___;
 	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
 	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
 
-	  vpunpcklqdq	$T1,$T0,$T0		# 0:1
-	  vpunpcklqdq	$T3,$T2,$T3		# 2:3
-	  lea		16*8($inp),$inp
 	vpmuludq	$H4,$S1,$M0
 	vpmuludq	$H4,$S2,$M1
 	vpmuludq	$H4,$S3,$M2
@@ -2386,21 +2367,26 @@ $code.=<<___;
 	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
 
 	################################################################
-	# lazy reduction (interleaved with tail of input splat)
+	# lazy reduction (interleaved with input splat)
+
+	 vpsrlq		\$52,$T0,$T2		# splat input
+	 vpsllq		\$12,$T4,$T3
 
 	vpsrlq		\$26,$D3,$H3
 	vpandq		$MASK,$D3,$D3
 	vpaddq		$H3,$D4,$H4		# h3 -> h4
 
+	 vporq		$T3,$T2,$T2
+
 	vpsrlq		\$26,$H0,$D0
 	vpandq		$MASK,$H0,$H0
 	vpaddq		$D0,$H1,$H1		# h0 -> h1
 
+	 vpandq		$MASK,$T2,$T2		# 2
+
 	vpsrlq		\$26,$H4,$D4
 	vpandq		$MASK,$H4,$H4
 
-	 vpsrlq		\$4,$T3,$T2
-
 	vpsrlq		\$26,$H1,$D1
 	vpandq		$MASK,$H1,$H1
 	vpaddq		$D1,$H2,$H2		# h1 -> h2
@@ -2409,15 +2395,14 @@ $code.=<<___;
 	vpsllq		\$2,$D4,$D4
 	vpaddq		$D4,$H0,$H0		# h4 -> h0
 
-	 vpandq		$MASK,$T2,$T2		# 2
+	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
 	 vpsrlq		\$26,$T0,$T1
 
 	vpsrlq		\$26,$H2,$D2
 	vpandq		$MASK,$H2,$H2
 	vpaddq		$D2,$D3,$H3		# h2 -> h3
 
-	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
-	 vpsrlq		\$30,$T3,$T3
+	 vpsrlq		\$14,$T4,$T3
 
 	vpsrlq		\$26,$H0,$D0
 	vpandq		$MASK,$H0,$H0
@@ -2430,8 +2415,8 @@ $code.=<<___;
 	vpaddq		$D3,$H4,$H4		# h3 -> h4
 
 	 vpandq		$MASK,$T0,$T0		# 0
-	 #vpandq	$MASK,$T1,$T1		# 1
-	 #vpandq	$MASK,$T3,$T3		# 3
+	 vpandq		$MASK,$T1,$T1		# 1
+	 vpandq		$MASK,$T3,$T3		# 3
 	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 
 	sub		\$128,$len
@@ -2443,7 +2428,7 @@ $code.=<<___;
 	# iteration we multiply least significant lane by r^8 and most
 	# significant one by r, that's why table gets shifted...
 
-	vpsrlq		\$32,$R0,$R0		# 0102030405060708
+	vpsrlq		\$32,$R0,$R0		# 0105020603070408
 	vpsrlq		\$32,$R1,$R1
 	vpsrlq		\$32,$R2,$R2
 	vpsrlq		\$32,$S3,$S3
@@ -2465,8 +2450,6 @@ $code.=<<___;
 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
-	 vpandq		$MASK,$T1,$T1		# 1, module-scheduled
-	 vpandq		$MASK,$T3,$T3		# 3
 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 	 vpaddq		$H1,$T1,$H1		# accumulate input
 	 vpaddq		$H3,$T3,$H3
@@ -2621,18 +2604,19 @@ $code.=<<___;
 	vmovd		%x#$H2,`4*2-48-64`($ctx)
 	vmovd		%x#$H3,`4*3-48-64`($ctx)
 	vmovd		%x#$H4,`4*4-48-64`($ctx)
+	vzeroall
 ___
 $code.=<<___	if ($win64);
-	vmovdqa		0x50(%r11),%xmm6
-	vmovdqa		0x60(%r11),%xmm7
-	vmovdqa		0x70(%r11),%xmm8
-	vmovdqa		0x80(%r11),%xmm9
-	vmovdqa		0x90(%r11),%xmm10
-	vmovdqa		0xa0(%r11),%xmm11
-	vmovdqa		0xb0(%r11),%xmm12
-	vmovdqa		0xc0(%r11),%xmm13
-	vmovdqa		0xd0(%r11),%xmm14
-	vmovdqa		0xe0(%r11),%xmm15
+	movdqa		0x50(%r11),%xmm6
+	movdqa		0x60(%r11),%xmm7
+	movdqa		0x70(%r11),%xmm8
+	movdqa		0x80(%r11),%xmm9
+	movdqa		0x90(%r11),%xmm10
+	movdqa		0xa0(%r11),%xmm11
+	movdqa		0xb0(%r11),%xmm12
+	movdqa		0xc0(%r11),%xmm13
+	movdqa		0xd0(%r11),%xmm14
+	movdqa		0xe0(%r11),%xmm15
 	lea		0xf8(%r11),%rsp
 .Ldo_avx512_epilogue:
 ___
@@ -2640,11 +2624,203 @@ $code.=<<___	if (!$win64);
 	lea		8(%r11),%rsp
 ___
 $code.=<<___;
-	vzeroupper
 	ret
 .size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
 ___
-}	}
+if ($avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize couple of
+# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
+# at amount of multiply-n-accumulate operations. Secondly, it makes it
+# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
+# reference implementations], which means that more such operations
+# would have to be performed in inner loop, which in turn makes critical
+# path longer. In other words, even though base 2^44 reduction might
+# look less elegant, overall critical path is actually shorter...
+
+$code.=<<___;
+.type	poly1305_init_base2_44,\@function,3
+.align	32
+poly1305_init_base2_44:
+	xor	%rax,%rax
+	mov	%rax,0($ctx)		# initialize hash value
+	mov	%rax,8($ctx)
+	mov	%rax,16($ctx)
+
+.Linit_base2_44:
+	lea	poly1305_blocks_vpmadd52(%rip),%r10
+	lea	poly1305_emit_base2_44(%rip),%r11
+
+	mov	\$0x0ffffffc0fffffff,%rax
+	mov	\$0x0ffffffc0ffffffc,%rcx
+	and	0($inp),%rax
+	mov	\$0x00000fffffffffff,%r8
+	and	8($inp),%rcx
+	mov	\$0x00000fffffffffff,%r9
+	and	%rax,%r8
+	shrd	\$44,%rcx,%rax
+	mov	%r8,40($ctx)		# r0
+	and	%r9,%rax
+	shr	\$24,%rcx
+	mov	%rax,48($ctx)		# r1
+	lea	(%rax,%rax,4),%rax	# *5
+	mov	%rcx,56($ctx)		# r2
+	shl	\$2,%rax		# magic <<2
+	lea	(%rcx,%rcx,4),%rcx	# *5
+	shl	\$2,%rcx		# magic <<2
+	mov	%rax,24($ctx)		# s1
+	mov	%rcx,32($ctx)		# s2
+___
+$code.=<<___	if ($flavour !~ /elf32/);
+	mov	%r10,0(%rdx)
+	mov	%r11,8(%rdx)
+___
+$code.=<<___	if ($flavour =~ /elf32/);
+	mov	%r10d,0(%rdx)
+	mov	%r11d,4(%rdx)
+___
+$code.=<<___;
+	mov	\$1,%eax
+	ret
+.size	poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type	poly1305_blocks_vpmadd52,\@function,4
+.align	32
+poly1305_blocks_vpmadd52:
+	shr	\$4,$len
+	jz	.Lno_data_vpmadd52		# too short
+
+	mov		\$7,%r10d
+	mov		\$1,%r11d
+	kmovw		%r10d,%k7
+	lea		.L2_44_inp_permd(%rip),%r10
+	shl		\$40,$padbit
+	kmovw		%r11d,%k1
+
+	vmovq		$padbit,%x#$PAD
+	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
+	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
+	vpermq		\$0xcf,$PAD,$PAD
+	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
+
+	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
+	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
+	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
+	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
+
+	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
+	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
+
+	jmp		.Loop_vpmadd52
+
+.align	32
+.Loop_vpmadd52:
+	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
+	lea		16($inp),$inp
+
+	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
+	vpsrlvq		$inp_shift,$T0,$T0
+	vpandq		$reduc_mask,$T0,$T0
+	vporq		$PAD,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
+
+	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
+	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
+	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
+
+	vpxord		$Dlo,$Dlo,$Dlo
+	vpxord		$Dhi,$Dhi,$Dhi
+
+	vpmadd52luq	$r2r1r0,$H0,$Dlo
+	vpmadd52huq	$r2r1r0,$H0,$Dhi
+
+	vpmadd52luq	$r1r0s2,$H1,$Dlo
+	vpmadd52huq	$r1r0s2,$H1,$Dhi
+
+	vpmadd52luq	$r0s2s1,$H2,$Dlo
+	vpmadd52huq	$r0s2s1,$H2,$Dhi
+
+	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
+	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
+	vpandq		$reduc_mask,$Dlo,$Dlo
+
+	vpaddq		$T0,$Dhi,$Dhi
+
+	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
+
+	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
+
+	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
+	vpandq		$reduc_mask,$Dlo,$Dlo
+
+	vpermq		\$0b10010011,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo
+
+	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
+
+	vpaddq		$T0,$Dlo,$Dlo
+	vpsllq		\$2,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo
+
+	dec		$len			# len-=16
+	jnz		.Loop_vpmadd52
+
+	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
+
+.Lno_data_vpmadd52:
+	ret
+.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+$code.=<<___;
+.type	poly1305_emit_base2_44,\@function,3
+.align	32
+poly1305_emit_base2_44:
+	mov	0($ctx),%r8	# load hash value
+	mov	8($ctx),%r9
+	mov	16($ctx),%r10
+
+	mov	%r9,%rax
+	shr	\$20,%r9
+	shl	\$44,%rax
+	mov	%r10,%rcx
+	shr	\$40,%r10
+	shl	\$24,%rcx
+
+	add	%rax,%r8
+	adc	%rcx,%r9
+	adc	\$0,%r10
+
+	mov	%r8,%rax
+	add	\$5,%r8		# compare to modulus
+	mov	%r9,%rcx
+	adc	\$0,%r9
+	adc	\$0,%r10
+	shr	\$2,%r10	# did 130-bit value overflow?
+	cmovnz	%r8,%rax
+	cmovnz	%r9,%rcx
+
+	add	0($nonce),%rax	# accumulate nonce
+	adc	8($nonce),%rcx
+	mov	%rax,0($mac)	# write result
+	mov	%rcx,8($mac)
+
+	ret
+.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+}	}	}
 $code.=<<___;
 .align	64
 .Lconst:
@@ -2654,10 +2830,19 @@ $code.=<<___;
 .long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 .Lmask26:
 .long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long	5,0,5,0,5,0,5,0
-.Lgather:
-.long	0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
+
+.L2_44_inp_permd:
+.long	0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad	0,12,24,64
+.L2_44_mask:
+.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad	44,44,42,64
+.L2_44_shift_lft:
+.quad	8,8,10,64
 ___
 }
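
The design note in the patch about pre-computed multiples of 5 can be followed with a scalar model of one .Loop_vpmadd52 iteration: absorb a 16-byte block into the base-2^44 hash, multiply by r with the 2^130 = 5 wrap folded in through s1 = 20*r1 and s2 = 20*r2, then run the carry chain. The sketch below is illustrative only: the sub name is made up, Math::BigInt stands in for the 52x52->104-bit VPMADD52 lanes, and a 64-bit perl is assumed for the plain bit operations. The assembly vectorizes and interleaves the same arithmetic rather than computing it in this order.

#!/usr/bin/env perl
# Hypothetical scalar model of one .Loop_vpmadd52 iteration (not part of
# the patch).  h and r are base-2^44 limbs as produced by the init sketch
# above; Math::BigInt stands in for the 52x52->104-bit VPMADD52 lanes.
use strict;
use warnings;
use Math::BigInt;

my $M44 = Math::BigInt->new(1)->blsft(44)->bsub(1);   # 2^44-1
my $M42 = Math::BigInt->new(1)->blsft(42)->bsub(1);   # 2^42-1

sub block_base2_44 {
    my ($h0, $h1, $h2, $r0, $r1, $r2, $s1, $s2, $m0, $m1, $padbit) = @_;

    # absorb the block: 44/44/40-bit pieces, padbit at bit 40 of limb 2
    $h0 = Math::BigInt->new($h0)->badd($m0 & 0xfffffffffff);
    $h1 = Math::BigInt->new($h1)->badd(
              (($m0 >> 44) | (($m1 & 0xffffff) << 20)) & 0xfffffffffff);
    $h2 = Math::BigInt->new($h2)->badd(($m1 >> 24) | ($padbit << 40));

    # schoolbook h*r with the 2^130 = 5 (mod p) wrap folded in via s1, s2
    my $d0 = $h0*$r0 + $h1*$s2 + $h2*$s1;
    my $d1 = $h0*$r1 + $h1*$r0 + $h2*$s2;
    my $d2 = $h0*$r2 + $h1*$r1 + $h2*$r0;

    # partial reduction: carry 44, 44, 42 bits; the carry out of limb 2
    # represents multiples of 2^130 and is folded back as *5
    $d1 += $d0 >> 44;        $d0 &= $M44;
    $d2 += $d1 >> 44;        $d1 &= $M44;
    $d0 += ($d2 >> 42) * 5;  $d2 &= $M42;
    $d1 += $d0 >> 44;        $d0 &= $M44;

    return ($d0, $d1, $d2);
}

As the comment block in the patch notes, being able to keep s1/s2 pre-computed outside the loop is what keeps the inner-loop critical path short; a 2^52 radix would force those multiples of 5 to be derived inside the loop.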
 

