[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Sat Feb 25 17:37:03 UTC 2017
The branch master has been updated
via fd910ef9593d4e16dabf4686ecabb351830045b6 (commit)
via 73e8a5c8261625a6e90e07e567263c69039e3d17 (commit)
via c1e1fc500da910dbf4358f902f6b824a3c34b922 (commit)
from c749308fc44a0b33b340e23834320dbef9fbf8de (commit)
- Log -----------------------------------------------------------------
commit fd910ef9593d4e16dabf4686ecabb351830045b6
Author: Andy Polyakov <appro at openssl.org>
Date: Fri Dec 30 00:00:16 2016 +0100
poly1305/asm/poly1305-x86_64.pl: add VPMADD52 code path.
This is an initial and minimal single-block implementation.
Reviewed-by: Rich Salz <rsalz at openssl.org>
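
For orientation, a scalar C model of what one block of the new base-2^44
path computes: the 44-bit limb split of the 16-byte block, the precomputed
s1 = 20*r1 and s2 = 20*r2 (the "magic <<2" multiples of 5 set up by the init
routine), and the carry chain that wraps the 2^130 overflow back as *5.
It sketches only the radix, not the 52-bit low/high product split that
VPMADD52 itself performs; it assumes a compiler with unsigned __int128,
and none of the identifiers below appear in the patch.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* one Poly1305 block in base 2^44; illustrative only */
    static void poly1305_block_base2_44(uint64_t h[3],       /* 44+44+42-bit limbs */
                                        const uint64_t r[3], /* clamped key, same radix */
                                        const uint64_t m[2], /* 16-byte block, little-endian */
                                        uint64_t padbit)
    {
        const uint64_t MASK44 = 0xfffffffffff, MASK42 = 0x3ffffffffff;
        uint64_t s1 = r[1] * 20, s2 = r[2] * 20;  /* 5*r<<2, as in poly1305_init_base2_44 */
        uint64_t c;
        u128 d0, d1, d2;

        /* accumulate input: 44+44+40 bits plus the padbit at 2^128 */
        h[0] += m[0] & MASK44;
        h[1] += ((m[0] >> 44) | (m[1] << 20)) & MASK44;
        h[2] += (m[1] >> 24) | (padbit << 40);

        /* h *= r modulo 2^130-5, products gathered per output limb */
        d0 = (u128)h[0]*r[0] + (u128)h[1]*s2 + (u128)h[2]*s1;
        d1 = (u128)h[0]*r[1] + (u128)h[1]*r[0] + (u128)h[2]*s2;
        d2 = (u128)h[0]*r[2] + (u128)h[1]*r[1] + (u128)h[2]*r[0];

        /* carry chain; overflow past 2^130 comes back multiplied by 5 */
        d1 += (uint64_t)(d0 >> 44);   h[0] = (uint64_t)d0 & MASK44;
        d2 += (uint64_t)(d1 >> 44);   h[1] = (uint64_t)d1 & MASK44;
        c   = (uint64_t)(d2 >> 42);   h[2] = (uint64_t)d2 & MASK42;
        h[0] += c * 5;                /* c + 4*c in the vector code */
        h[1] += h[0] >> 44;           h[0] &= MASK44;
    }
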
commit 73e8a5c8261625a6e90e07e567263c69039e3d17
Author: Andy Polyakov <appro at openssl.org>
Date: Sun Dec 25 13:10:00 2016 +0100
poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.
Effectively it's a minor size optimization, 5-6% per affected subroutine.
Reviewed-by: Rich Salz <rsalz at openssl.org>
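
As a quick illustration of the shuffle this buys: each vpermq+vpshufd pair
in the table expansion becomes a single vpermd driven by the new
.Lpermd_avx2 index vector {2,2,2,3,2,0,2,1}. The digits below are the lane
labels from the "00003412 -> 14243444" comments in the patch; the C program
is purely illustrative and not part of the commit.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* ymm dword lanes, lane 0 first; reads "00003412" from lane 7 down */
        uint32_t src[8] = { 2, 1, 4, 3, 0, 0, 0, 0 };
        uint32_t idx[8] = { 2, 2, 2, 3, 2, 0, 2, 1 };   /* .Lpermd_avx2 */
        uint32_t dst[8];
        int i;

        for (i = 0; i < 8; i++)
            dst[i] = src[idx[i] & 7];   /* vpermd: dst lane i <- src lane idx[i] */

        for (i = 7; i >= 0; i--)        /* prints 14243444 */
            printf("%u", dst[i]);
        printf("\n");
        return 0;
    }
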
commit c1e1fc500da910dbf4358f902f6b824a3c34b922
Author: Andy Polyakov <appro at openssl.org>
Date: Sun Dec 25 13:05:35 2016 +0100
poly1305/asm/poly1305-x86_64.pl: optimize AVX512 code path.
On pre-Skylake processors the best optimization strategy was balancing
port-specific instructions, while on Skylake minimizing the sheer
instruction count appears more sensible.
Reviewed-by: Rich Salz <rsalz at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/poly1305/asm/poly1305-x86_64.pl | 465 +++++++++++++++++++++++----------
1 file changed, 325 insertions(+), 140 deletions(-)
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index baf3c75..ff4efb3 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -62,13 +62,13 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+ $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
+ $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
+ $avx += 2 if ($1==2.11 && $2>=8);
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
@@ -178,6 +178,13 @@ $code.=<<___ if ($avx>1);
bt \$`5+32`,%r9 # AVX2?
cmovc %rax,%r10
___
+$code.=<<___ if ($avx>3);
+ mov \$`(1<<31|1<<21|1<<16)`,%rax
+ shr \$32,%r9
+ and %rax,%r9
+ cmp %rax,%r9
+ je .Linit_base2_44
+___
$code.=<<___;
mov \$0x0ffffffc0fffffff,%rax
mov \$0x0ffffffc0ffffffc,%rcx
@@ -1631,8 +1638,9 @@ $code.=<<___ if ($win64);
.Ldo_avx2_body:
___
$code.=<<___;
- lea 48+64($ctx),$ctx # size optimization
lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
# expand and copy pre-calculated table to stack
vmovdqu `16*0-64`($ctx),%x#$T2
@@ -1642,36 +1650,28 @@ $code.=<<___;
vmovdqu `16*3-64`($ctx),%x#$D0
vmovdqu `16*4-64`($ctx),%x#$D1
vmovdqu `16*5-64`($ctx),%x#$D2
+ lea 0x90(%rsp),%rax # size optimization
vmovdqu `16*6-64`($ctx),%x#$D3
- vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
+ vpermd $T2,$T0,$T2 # 00003412 -> 14243444
vmovdqu `16*7-64`($ctx),%x#$D4
- vpermq \$0x15,$T3,$T3
- vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
+ vpermd $T3,$T0,$T3
vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermq \$0x15,$T4,$T4
- vpshufd \$0xc8,$T3,$T3
+ vpermd $T4,$T0,$T4
vmovdqa $T2,0x00(%rsp)
- vpermq \$0x15,$D0,$D0
- vpshufd \$0xc8,$T4,$T4
- vmovdqa $T3,0x20(%rsp)
- vpermq \$0x15,$D1,$D1
- vpshufd \$0xc8,$D0,$D0
- vmovdqa $T4,0x40(%rsp)
- vpermq \$0x15,$D2,$D2
- vpshufd \$0xc8,$D1,$D1
- vmovdqa $D0,0x60(%rsp)
- vpermq \$0x15,$D3,$D3
- vpshufd \$0xc8,$D2,$D2
- vmovdqa $D1,0x80(%rsp)
- vpermq \$0x15,$D4,$D4
- vpshufd \$0xc8,$D3,$D3
- vmovdqa $D2,0xa0(%rsp)
- vpermq \$0x15,$MASK,$MASK
- vpshufd \$0xc8,$D4,$D4
- vmovdqa $D3,0xc0(%rsp)
- vpshufd \$0xc8,$MASK,$MASK
- vmovdqa $D4,0xe0(%rsp)
- vmovdqa $MASK,0x100(%rsp)
+ vpermd $D0,$T0,$D0
+ vmovdqa $T3,0x20-0x90(%rax)
+ vpermd $D1,$T0,$D1
+ vmovdqa $T4,0x40-0x90(%rax)
+ vpermd $D2,$T0,$D2
+ vmovdqa $D0,0x60-0x90(%rax)
+ vpermd $D3,$T0,$D3
+ vmovdqa $D1,0x80-0x90(%rax)
+ vpermd $D4,$T0,$D4
+ vmovdqa $D2,0xa0-0x90(%rax)
+ vpermd $MASK,$T0,$MASK
+ vmovdqa $D3,0xc0-0x90(%rax)
+ vmovdqa $D4,0xe0-0x90(%rax)
+ vmovdqa $MASK,0x100-0x90(%rax)
vmovdqa 64(%rcx),$MASK # .Lmask26
################################################################
@@ -1698,7 +1698,6 @@ $code.=<<___;
vpand $MASK,$T3,$T3 # 3
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- lea 0x90(%rsp),%rax # size optimization
vpaddq $H2,$T2,$H2 # accumulate input
sub \$64,$len
jz .Ltail_avx2
@@ -2055,8 +2054,9 @@ $code.=<<___ if ($win64);
.Ldo_avx512_body:
___
$code.=<<___;
- lea 48+64($ctx),$ctx # size optimization
lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),$T2 # .Lpermd_avx2
# expand pre-calculated table
vmovdqu32 `16*0-64`($ctx),%x#$R0
@@ -2069,33 +2069,23 @@ $code.=<<___;
vmovdqu32 `16*6-64`($ctx),%x#$S3
vmovdqu32 `16*7-64`($ctx),%x#$R4
vmovdqu32 `16*8-64`($ctx),%x#$S4
- vpermq \$0x15,$R0,$R0 # 00003412 -> 12343434
+ vpermd $R0,$T2,$R0 # 00003412 -> 14243444
vmovdqa64 64(%rcx),$MASK # .Lmask26
- vpermq \$0x15,$R1,$R1
- vmovdqa32 128(%rcx),$GATHER # .Lgather
- vpermq \$0x15,$S1,$S1
- vpshufd \$0xc8,$R0,$R0 # 12343434 -> 14243444
- vpermq \$0x15,$R2,$R2
- vpshufd \$0xc8,$R1,$R1
+ vpermd $R1,$T2,$R1
+ vpermd $S1,$T2,$S1
+ vpermd $R2,$T2,$R2
vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0
vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermq \$0x15,$S2,$S2
- vpshufd \$0xc8,$S1,$S1
+ vpermd $S2,$T2,$S2
vmovdqa32 $R1,0x20(%rsp)
vpsrlq \$32,$R1,$T1
- vpermq \$0x15,$R3,$R3
- vpshufd \$0xc8,$R2,$R2
+ vpermd $R3,$T2,$R3
vmovdqa32 $S1,0x40(%rsp)
- vpermq \$0x15,$S3,$S3
- vpshufd \$0xc8,$S2,$S2
- vpermq \$0x15,$R4,$R4
- vpshufd \$0xc8,$R3,$R3
+ vpermd $S3,$T2,$S3
+ vpermd $R4,$T2,$R4
vmovdqa32 $R2,0x60(%rsp)
- vpermq \$0x15,$S4,$S4
- vpshufd \$0xc8,$S3,$S3
+ vpermd $S4,$T2,$S4
vmovdqa32 $S2,0x80(%rsp)
- vpshufd \$0xc8,$R4,$R4
- vpshufd \$0xc8,$S4,$S4
vmovdqa32 $R3,0xa0(%rsp)
vmovdqa32 $S3,0xc0(%rsp)
vmovdqa32 $R4,0xe0(%rsp)
@@ -2165,10 +2155,9 @@ $code.=<<___;
################################################################
# load input
- vmovdqu64 16*0($inp),%x#$T0
- vmovdqu64 16*1($inp),%x#$T1
- vinserti64x2 \$1,16*2($inp),$T0,$T0
- vinserti64x2 \$1,16*3($inp),$T1,$T1
+ vmovdqu64 16*0($inp),%z#$T3
+ vmovdqu64 16*4($inp),%z#$T4
+ lea 16*8($inp),$inp
################################################################
# lazy reduction
@@ -2205,50 +2194,51 @@ $code.=<<___;
vpaddq $M3,$D4,$D4 # d3 -> d4
___
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
map(s/%y/%z/,($MASK));
$code.=<<___;
################################################################
- # load more input
- vinserti64x2 \$2,16*4($inp),$T0,$T0
- vinserti64x2 \$2,16*5($inp),$T1,$T1
- vinserti64x2 \$3,16*6($inp),$T0,$T0
- vinserti64x2 \$3,16*7($inp),$T1,$T1
- lea 16*8($inp),$inp
+ # at this point we have 14243444 in $R0-$S4 and 05060708 in
+ # $D0-$D4, ...
- vpbroadcastq %x#$MASK,$MASK
- vpbroadcastq 32(%rcx),$PADBIT
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, and the goal is 1828384858687888 in $R0-$S4
+ # ... since input 64-bit lanes are ordered as 73625140, we could
+ # "vperm" it to 76543210 (here and in each loop iteration), *or*
+ # we could just flow along, hence the goal for $R0-$S4 is
+ # 1858286838784888 ...
+
+ mov \$0b0110011001100110,%eax
+ mov \$0b1100110011001100,%r8d
+ mov \$0b0101010101010101,%r9d
+ kmovw %eax,%k1
+ kmovw %r8d,%k2
+ kmovw %r9d,%k3
- mov \$0x5555,%eax
- vpbroadcastq %x#$D0,$M0 # 0808080808080808
+ vpbroadcastq %x#$D0,$M0 # 0808080808080808
vpbroadcastq %x#$D1,$M1
vpbroadcastq %x#$D2,$M2
vpbroadcastq %x#$D3,$M3
vpbroadcastq %x#$D4,$M4
- kmovw %eax,%k3
- vpsllq \$32,$D0,$D0 # 05060708 -> 50607080
- vpsllq \$32,$D1,$D1
- vpsllq \$32,$D2,$D2
- vpsllq \$32,$D3,$D3
- vpsllq \$32,$D4,$D4
-___
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-$code.=<<___;
- vinserti64x4 \$1,$R0,$D0,$D0 # 1424344450607080
- vinserti64x4 \$1,$R1,$D1,$D1
- vinserti64x4 \$1,$R2,$D2,$D2
- vinserti64x4 \$1,$R3,$D3,$D3
- vinserti64x4 \$1,$R4,$D4,$D4
-___
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
-$code.=<<___;
- vpblendmd $M0,$D0,${R0}{%k3} # 1828384858687888
+
+ vpexpandd $D0,${D0}{%k1} # 05060708 -> -05--06--07--08-
+ vpexpandd $D1,${D1}{%k1}
+ vpexpandd $D2,${D2}{%k1}
+ vpexpandd $D3,${D3}{%k1}
+ vpexpandd $D4,${D4}{%k1}
+
+ vpexpandd $R0,${D0}{%k2} # -05--06--07--08- -> 145-246-347-448-
+ vpexpandd $R1,${D1}{%k2}
+ vpexpandd $R2,${D2}{%k2}
+ vpexpandd $R3,${D3}{%k2}
+ vpexpandd $R4,${D4}{%k2}
+
+ vpblendmd $M0,$D0,${R0}{%k3} # 1858286838784888
vpblendmd $M1,$D1,${R1}{%k3}
vpblendmd $M2,$D2,${R2}{%k3}
vpblendmd $M3,$D3,${R3}{%k3}
@@ -2263,27 +2253,28 @@ $code.=<<___;
vpaddd $R3,$S3,$S3
vpaddd $R4,$S4,$S4
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpbroadcastq %x#$MASK,$MASK
+ vpbroadcastq 32(%rcx),$PADBIT # .L129
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+ vporq $T3,$T2,$T2
vpsrlq \$26,$T0,$T1
+ vpsrlq \$14,$T4,$T3
vpsrlq \$40,$T4,$T4 # 4
vpandq $MASK,$T2,$T2 # 2
vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
+ vpandq $MASK,$T1,$T1 # 1
+ vpandq $MASK,$T3,$T3 # 3
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpaddq $H2,$T2,$H2 # accumulate input
mov \$0x0f,%eax
sub \$192,$len
jbe .Ltail_avx512
+ jmp .Loop_avx512
+.align 32
.Loop_avx512:
################################################################
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2315,12 +2306,8 @@ $code.=<<___;
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
vpaddq $H0,$T0,$H0
- vmovdqu64 16*0($inp),%x#$M0 # load input
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1, module-scheduled
- vmovdqu64 16*1($inp),%x#$M1
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
@@ -2328,8 +2315,9 @@ $code.=<<___;
vpaddq $H3,$T3,$H3
vpaddq $H4,$T4,$H4
- vinserti64x2 \$1,16*2($inp),$M0,$T0
- vinserti64x2 \$1,16*3($inp),$M1,$T1
+ vmovdqu64 16*0($inp),$T3 # load input
+ vmovdqu64 16*4($inp),$T4
+ lea 16*8($inp),$inp
vpmuludq $H0,$R3,$M3
vpmuludq $H0,$R4,$M4
vpmuludq $H0,$R0,$M0
@@ -2339,8 +2327,6 @@ $code.=<<___;
vpaddq $M0,$D0,$D0 # d0 += h0*r0
vpaddq $M1,$D1,$D1 # d1 += h0*r1
- vinserti64x2 \$2,16*4($inp),$T0,$T0
- vinserti64x2 \$2,16*5($inp),$T1,$T1
vpmuludq $H1,$R2,$M3
vpmuludq $H1,$R3,$M4
vpmuludq $H1,$S4,$M0
@@ -2350,8 +2336,9 @@ $code.=<<___;
vpaddq $M0,$D0,$D0 # d0 += h1*s4
vpaddq $M2,$D2,$D2 # d2 += h0*r2
- vinserti64x2 \$3,16*6($inp),$T0,$T0
- vinserti64x2 \$3,16*7($inp),$T1,$T1
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
vpmuludq $H3,$R0,$M3
vpmuludq $H3,$R1,$M4
vpmuludq $H1,$R0,$M1
@@ -2361,9 +2348,6 @@ $code.=<<___;
vpaddq $M1,$D1,$D1 # d1 += h1*r0
vpaddq $M2,$D2,$D2 # d2 += h1*r1
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
vpmuludq $H4,$S4,$M3
vpmuludq $H4,$R0,$M4
vpmuludq $H3,$S2,$M0
@@ -2375,9 +2359,6 @@ $code.=<<___;
vpaddq $M1,$D1,$D1 # d1 += h3*s3
vpaddq $M2,$D2,$D2 # d2 += h3*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- lea 16*8($inp),$inp
vpmuludq $H4,$S1,$M0
vpmuludq $H4,$S2,$M1
vpmuludq $H4,$S3,$M2
@@ -2386,21 +2367,26 @@ $code.=<<___;
vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
################################################################
- # lazy reduction (interleaved with tail of input splat)
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
vpsrlq \$26,$D3,$H3
vpandq $MASK,$D3,$D3
vpaddq $H3,$D4,$H4 # h3 -> h4
+ vporq $T3,$T2,$T2
+
vpsrlq \$26,$H0,$D0
vpandq $MASK,$H0,$H0
vpaddq $D0,$H1,$H1 # h0 -> h1
+ vpandq $MASK,$T2,$T2 # 2
+
vpsrlq \$26,$H4,$D4
vpandq $MASK,$H4,$H4
- vpsrlq \$4,$T3,$T2
-
vpsrlq \$26,$H1,$D1
vpandq $MASK,$H1,$H1
vpaddq $D1,$H2,$H2 # h1 -> h2
@@ -2409,15 +2395,14 @@ $code.=<<___;
vpsllq \$2,$D4,$D4
vpaddq $D4,$H0,$H0 # h4 -> h0
- vpandq $MASK,$T2,$T2 # 2
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
vpsrlq \$26,$T0,$T1
vpsrlq \$26,$H2,$D2
vpandq $MASK,$H2,$H2
vpaddq $D2,$D3,$H3 # h2 -> h3
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
+ vpsrlq \$14,$T4,$T3
vpsrlq \$26,$H0,$D0
vpandq $MASK,$H0,$H0
@@ -2430,8 +2415,8 @@ $code.=<<___;
vpaddq $D3,$H4,$H4 # h3 -> h4
vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
+ vpandq $MASK,$T1,$T1 # 1
+ vpandq $MASK,$T3,$T3 # 3
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
sub \$128,$len
@@ -2443,7 +2428,7 @@ $code.=<<___;
# iteration we multiply least significant lane by r^8 and most
# significant one by r, that's why table gets shifted...
- vpsrlq \$32,$R0,$R0 # 0102030405060708
+ vpsrlq \$32,$R0,$R0 # 0105020603070408
vpsrlq \$32,$R1,$R1
vpsrlq \$32,$R2,$R2
vpsrlq \$32,$S3,$S3
@@ -2465,8 +2450,6 @@ $code.=<<___;
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpandq $MASK,$T1,$T1 # 1, module-scheduled
- vpandq $MASK,$T3,$T3 # 3
vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpaddq $H1,$T1,$H1 # accumulate input
vpaddq $H3,$T3,$H3
@@ -2621,18 +2604,19 @@ $code.=<<___;
vmovd %x#$H2,`4*2-48-64`($ctx)
vmovd %x#$H3,`4*3-48-64`($ctx)
vmovd %x#$H4,`4*4-48-64`($ctx)
+ vzeroall
___
$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
+ movdqa 0x50(%r11),%xmm6
+ movdqa 0x60(%r11),%xmm7
+ movdqa 0x70(%r11),%xmm8
+ movdqa 0x80(%r11),%xmm9
+ movdqa 0x90(%r11),%xmm10
+ movdqa 0xa0(%r11),%xmm11
+ movdqa 0xb0(%r11),%xmm12
+ movdqa 0xc0(%r11),%xmm13
+ movdqa 0xd0(%r11),%xmm14
+ movdqa 0xe0(%r11),%xmm15
lea 0xf8(%r11),%rsp
.Ldo_avx512_epilogue:
___
@@ -2640,11 +2624,203 @@ $code.=<<___ if (!$win64);
lea 8(%r11),%rsp
___
$code.=<<___;
- vzeroupper
ret
.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
___
-} }
+if ($avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize couple of
+# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
+# at amount of multiply-n-accumulate operations. Secondly, it makes it
+# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
+# reference implementations], which means that more such operations
+# would have to be performed in inner loop, which in turn makes critical
+# path longer. In other words, even though base 2^44 reduction might
+# look less elegant, overall critical path is actually shorter...
+
+$code.=<<___;
+.type poly1305_init_base2_44,\@function,3
+.align 32
+poly1305_init_base2_44:
+ xor %rax,%rax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+.Linit_base2_44:
+ lea poly1305_blocks_vpmadd52(%rip),%r10
+ lea poly1305_emit_base2_44(%rip),%r11
+
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ mov \$0x00000fffffffffff,%r8
+ and 8($inp),%rcx
+ mov \$0x00000fffffffffff,%r9
+ and %rax,%r8
+ shrd \$44,%rcx,%rax
+ mov %r8,40($ctx) # r0
+ and %r9,%rax
+ shr \$24,%rcx
+ mov %rax,48($ctx) # r1
+ lea (%rax,%rax,4),%rax # *5
+ mov %rcx,56($ctx) # r2
+ shl \$2,%rax # magic <<2
+ lea (%rcx,%rcx,4),%rcx # *5
+ shl \$2,%rcx # magic <<2
+ mov %rax,24($ctx) # s1
+ mov %rcx,32($ctx) # s2
+___
+$code.=<<___ if ($flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+ ret
+.size poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52,\@function,4
+.align 32
+poly1305_blocks_vpmadd52:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52 # too short
+
+ mov \$7,%r10d
+ mov \$1,%r11d
+ kmovw %r10d,%k7
+ lea .L2_44_inp_permd(%rip),%r10
+ shl \$40,$padbit
+ kmovw %r11d,%k1
+
+ vmovq $padbit,%x#$PAD
+ vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
+ vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
+ vpermq \$0xcf,$PAD,$PAD
+ vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
+
+ vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
+ vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
+ vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
+ vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
+
+ vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
+ vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
+
+ jmp .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+ vmovdqu32 0($inp),%x#$T0 # load input as ----3210
+ lea 16($inp),$inp
+
+ vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
+ vpsrlvq $inp_shift,$T0,$T0
+ vpandq $reduc_mask,$T0,$T0
+ vporq $PAD,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo # accumulate input
+
+ vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
+ vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
+ vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
+
+ vpxord $Dlo,$Dlo,$Dlo
+ vpxord $Dhi,$Dhi,$Dhi
+
+ vpmadd52luq $r2r1r0,$H0,$Dlo
+ vpmadd52huq $r2r1r0,$H0,$Dhi
+
+ vpmadd52luq $r1r0s2,$H1,$Dlo
+ vpmadd52huq $r1r0s2,$H1,$Dhi
+
+ vpmadd52luq $r0s2s1,$H2,$Dlo
+ vpmadd52huq $r0s2s1,$H2,$Dhi
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
+ vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpaddq $T0,$Dhi,$Dhi
+
+ vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
+
+ vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
+
+ vpaddq $T0,$Dlo,$Dlo
+ vpsllq \$2,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ dec $len # len-=16
+ jnz .Loop_vpmadd52
+
+ vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
+
+.Lno_data_vpmadd52:
+ ret
+.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+$code.=<<___;
+.type poly1305_emit_base2_44,\@function,3
+.align 32
+poly1305_emit_base2_44:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r9,%rax
+ shr \$20,%r9
+ shl \$44,%rax
+ mov %r10,%rcx
+ shr \$40,%r10
+ shl \$24,%rcx
+
+ add %rax,%r8
+ adc %rcx,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+} } }
$code.=<<___;
.align 64
.Lconst:
@@ -2654,10 +2830,19 @@ $code.=<<___;
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long 5,0,5,0,5,0,5,0
-.Lgather:
-.long 0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long 2,2,2,3,2,0,2,1
+
+.L2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
___
}
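
For completeness, the final conversion done by poly1305_emit_base2_44 above
can be sketched in scalar C as well: gather the three 2^44-radix limbs back
into a 130-bit value, conditionally subtract 2^130-5 by adding 5 and testing
bit 130, then add the nonce and write the 128-bit tag. The sketch assumes
unsigned __int128 and a little-endian host (like the x86_64 code it models),
uses an if() where the assembly uses cmovnz to stay constant-time, and is
not part of the patch.

    #include <stdint.h>
    #include <string.h>

    typedef unsigned __int128 u128;

    static void poly1305_emit_44(uint8_t mac[16], const uint64_t h[3],
                                 const uint64_t nonce[2])
    {
        u128     t  = (u128)h[0] | (u128)h[1] << 44 | (u128)h[2] << 88;
        uint64_t hi = h[2] >> 40;              /* bits 128 and up */
        u128     t5 = t + 5;
        uint64_t lo, md;

        hi += (t5 < t);                        /* carry of the +5 into bit 128 */
        if (hi >> 2)                           /* did h+5 reach 2^130? */
            t = t5;                            /* keep h+5 == h-(2^130-5) */

        lo = (uint64_t)t + nonce[0];           /* accumulate nonce mod 2^128 */
        md = (uint64_t)(t >> 64) + nonce[1] + (lo < (uint64_t)t);

        memcpy(mac, &lo, 8);                   /* write little-endian tag */
        memcpy(mac + 8, &md, 8);
    }
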