[openssl-commits] [openssl] OpenSSL source code branch master updated. 9e557ab2624d5c5e8d799c123f5e8211664d8845
Andy Polyakov
appro at openssl.org
Sun Jan 4 22:22:27 UTC 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "OpenSSL source code".
The branch, master has been updated
via 9e557ab2624d5c5e8d799c123f5e8211664d8845 (commit)
from 2c60925d1ccc0b96287bdc9acb90198e7180d642 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 9e557ab2624d5c5e8d799c123f5e8211664d8845
Author: Andy Polyakov <appro at openssl.org>
Date: Sun Jan 4 21:29:50 2015 +0100
ecp_nistz256-x86_64.pl: fix occasional failures.
RT: 3607
Reviewed-by: Adam Langley <agl at google.com>
Reviewed-by: Emilia Kasper <emilia at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/ec/asm/ecp_nistz256-x86_64.pl | 481 ++++++++++++++--------------------
1 file changed, 191 insertions(+), 290 deletions(-)
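In outline, the patch makes three kinds of changes to the P-256 code. In the mulq-based Montgomery paths it replaces the multiplications by the two low prime limbs with a shl/shr pair, exploiting the identity p[0] + p[1]*2^64 == 2^96 - 1. In the mulx/ADX paths it drops the interleaved adcx/adox carry chains in favour of a single add/adc chain. And in every final conditional subtraction it propagates the borrow through the carry word (sbb instead of neg or bt) and flips the selection from cmovnc to cmovc, which appears to be the correctness fix behind RT#3607; the first two changes account for the updated speedup table below.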
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 4486a5e..cdff22a 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -31,15 +31,16 @@
# Further optimization by <appro at openssl.org>:
#
# this/original
-# Opteron +8-33%
-# Bulldozer +10-30%
-# P4 +14-38%
-# Westmere +8-23%
-# Sandy Bridge +8-24%
-# Ivy Bridge +7-25%
-# Haswell +5-25%
-# Atom +10-32%
-# VIA Nano +37-130%
+# Opteron +12-49%
+# Bulldozer +14-45%
+# P4 +18-46%
+# Westmere +12-34%
+# Sandy Bridge +9-35%
+# Ivy Bridge +9-35%
+# Haswell +8-37%
+# Broadwell +18-58%
+# Atom +15-50%
+# VIA Nano +43-160%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively
@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
# and add the result to the acc.
# Due to the special form of p256 we do some optimizations
#
- # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
- # then we add acc[0] and get acc[0] x 2^64
-
- mulq $poly1
- xor $t0, $t0
- add $acc0, $acc1 # +=acc[0]*2^64
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
-
- # acc[0] x p256[2] = 0
- adc %rdx, $acc2
- adc \$0, $t0
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+ # then we add acc[0] and get acc[0] x 2^96
+ mov $acc0, $t1
+ shl \$32, $acc0
mulq $poly3
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t1
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t1, $acc2
+ adc %rax, $acc3
mov 8*1($b_ptr), %rax
adc %rdx, $acc4
adc \$0, $acc5
+ xor $acc0, $acc0
########################################################################
# Multiply by b[1]
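The comment in this hunk is the whole trick: the low limbs of the P-256 prime satisfy p[0] + p[1]*2^64 == 2^96 - 1, and -p^-1 mod 2^64 == 1, so the Montgomery multiplier is acc[0] itself, and folding it in needs no multiplication by p[0..1] at all -- acc[0]*p[0..1] plus the acc[0] already sitting in the low limb is exactly acc[0] << 96, one shl/shr pair. A minimal C sketch of one such reduction step (an illustration under these identities, not the committed Perl/asm; assumes a compiler with unsigned __int128):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    #define P3 0xffffffff00000001ULL          /* .Lpoly[3] */

    /* Fold limb a[0] into a[1..4], i.e. acc += a[0]*p; afterwards the
     * low limb is zero by construction and is simply dropped.  Mirrors
     * the new shl/shr + single-mulq sequence. */
    static uint64_t reduce_step(uint64_t a[5])
    {
        uint64_t m  = a[0];                   /* -p^-1 mod 2^64 == 1 */
        uint64_t lo = m << 32, hi = m >> 32;  /* m << 96, split      */
        u128 mp3 = (u128)m * P3;              /* the one real mulq   */
        u128 t;

        t = (u128)a[1] + lo;                             /* add      */
        a[1] = (uint64_t)t;
        t = (u128)a[2] + hi + (uint64_t)(t >> 64);       /* adc      */
        a[2] = (uint64_t)t;
        t = (u128)a[3] + (uint64_t)mp3 + (uint64_t)(t >> 64);
        a[3] = (uint64_t)t;
        t = (u128)a[4] + (uint64_t)(mp3 >> 64) + (uint64_t)(t >> 64);
        a[4] = (uint64_t)t;
        return (uint64_t)(t >> 64);           /* carry into acc[5]   */
    }

Each of the four reduction steps in __ecp_nistz256_mul_montq is an instance of this, with the mulq by p[3] interleaved between the shifts so the multiply's latency overlaps the additions.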
@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
########################################################################
# Second reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
+ mov $acc1, $t1
+ shl \$32, $acc1
mulq $poly3
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t1
+ add $acc1, $acc2
+ adc $t1, $acc3
+ adc %rax, $acc4
mov 8*2($b_ptr), %rax
adc %rdx, $acc5
adc \$0, $acc0
+ xor $acc1, $acc1
########################################################################
# Multiply by b[2]
@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
########################################################################
# Third reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
+ mov $acc2, $t1
+ shl \$32, $acc2
mulq $poly3
- xor $acc2, $acc2
- add $t0, $acc5
- adc \$0, %rdx
- add %rax, $acc5
+ shr \$32, $t1
+ add $acc2, $acc3
+ adc $t1, $acc4
+ adc %rax, $acc5
mov 8*3($b_ptr), %rax
adc %rdx, $acc0
adc \$0, $acc1
+ xor $acc2, $acc2
########################################################################
# Multiply by b[3]
@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
########################################################################
# Final reduction step
- mulq $poly1
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc5
- #adc \$0, $t0 # doesn't overflow
-
+ mov $acc3, $t1
+ shl \$32, $acc3
mulq $poly3
- #add $t0, $acc0
- #adc \$0, %rdx
+ shr \$32, $t1
+ add $acc3, $acc4
+ adc $t1, $acc5
mov $acc4, $t0
- add %rax, $acc0
+ adc %rax, $acc0
adc %rdx, $acc1
mov $acc5, $t1
adc \$0, $acc2
@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3]
- neg $acc2
+ sbb \$0, $acc2
- cmovnc $t0, $acc4
- cmovnc $t1, $acc5
+ cmovc $t0, $acc4
+ cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t2, $acc0
+ cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t3, $acc1
+ cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
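Reading this hunk suggests the likely root cause of the "occasional failures" (the log does not spell it out, so treat this as inference): the reduced value occupies four limbs plus a carry word ($acc2), and the old tail tested only that carry word -- neg $acc2 sets CF iff $acc2 is nonzero -- while discarding the borrow from the four-limb sbb chain. A value with a zero carry word but limbs >= p therefore escaped the final subtraction. The fix folds the borrow through the carry word with sbb \$0, $acc2, after which CF is set exactly when the full five-word value was below p, and the selection flips from cmovnc to cmovc. A hedged C sketch of the corrected branchless tail (illustration only; little-endian limbs):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Branchless final subtraction for P-256, mirroring the fixed
     * tail: subtract p from the 4-limb result, fold the borrow
     * through the carry word ("sbb $0"), and keep the pre-subtraction
     * value only when the full 5-word quantity was below p. */
    static void cond_sub_p256(uint64_t a[4], uint64_t carry)
    {
        static const uint64_t p[4] = {                /* .Lpoly */
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t r[4], borrow = 0, keep;
        unsigned i;

        for (i = 0; i < 4; i++) {                 /* sub/sbb chain    */
            u128 d = (u128)a[i] - p[i] - borrow;
            r[i]   = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        /* sbb $0, carry: CF set iff (carry:a) < p */
        keep = 0 - (uint64_t)(carry < borrow);    /* all-ones if CF   */

        for (i = 0; i < 4; i++)                   /* cmovc            */
            a[i] = (a[i] & keep) | (r[i] & ~keep);
    }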
@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
##########################################
# Now the reduction
# First iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc0, $t0
+ shl \$32, $acc0
mulq $t1
- xor $acc0, $acc0
- #add $t0, $acc3
- #adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t0
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
##########################################
# Second iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
mulq $t1
- xor $acc1, $acc1
- #add $t0, $acc4
- #adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
mulq $t1
- xor $acc2, $acc2
- #add $t0, $acc0
- #adc \$0, %rdx
- add %rax, $acc0
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
mulq $t1
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ adc %rax, $acc2
+ adc \$0, %rdx
xor $acc3, $acc3
- #add $t0, $acc1
- #adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- adc \$0, $acc3
############################################
# Add the rest of the acc
- add $acc0, $acc5
+ add $acc0, $acc4
+ adc $acc1, $acc5
mov $acc4, $acc0
- adc $acc1, $acc6
- adc $acc2, $acc7
+ adc $acc2, $acc6
+ adc %rdx, $acc7
mov $acc5, $acc1
adc \$0, $acc3
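Note also how the rewritten squaring reduction pipelines the carries: each step ends with adc \$0, %rdx, leaving the high half of its p[3] product in %rdx, and the next step banks it with mov %rdx, $accN before issuing its own mulq, so no extra accumulator register is needed between iterations.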
@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3]
- neg $acc3
+ sbb \$0, $acc3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $t0, $acc7
+ cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# First reduction step
- xor $acc0, $acc0 # $acc0=0,cf=0,of=0
- adox $t1, $acc1
- adox $t0, $acc2
+ add $t1, $acc1
+ adc $t0, $acc2
mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx
- adox $t0, $acc3
- adcx $t1, $acc4
-
- adox $acc0, $acc4
- adcx $acc0, $acc5 # cf=0
- adox $acc0, $acc5 # of=0
+ adc $t0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
########################################################################
# Multiply by b[1]
@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# Second reduction step
- xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
- adox $t0, $acc2
- adox $t1, $acc3
+ add $t0, $acc2
+ adc $t1, $acc3
mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx
- adox $t0, $acc4
- adcx $t1, $acc5
-
- adox $acc1, $acc5
- adcx $acc1, $acc0 # cf=0
- adox $acc1, $acc0 # of=0
+ adc $t0, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+ xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
########################################################################
# Multiply by b[2]
@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# Third reduction step
- xor $acc2, $acc2 # $acc2=0,cf=0,of=0
- adox $t0, $acc3
- adox $t1, $acc4
+ add $t0, $acc3
+ adc $t1, $acc4
mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx
- adox $t0, $acc5
- adcx $t1, $acc0
-
- adox $acc2, $acc0
- adcx $acc2, $acc1 # cf=0
- adox $acc2, $acc1 # of=0
+ adc $t0, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
########################################################################
# Multiply by b[3]
@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
########################################################################
# Fourth reduction step
- xor $acc3, $acc3 # $acc3=0,cf=0,of=0
- adox $t0, $acc4
- adox $t1, $acc5
+ add $t0, $acc4
+ adc $t1, $acc5
mulx $poly3, $t0, $t1
mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1
- adcx $t0, $acc0
- adox $t1, $acc1
+ adc $t0, $acc0
mov $acc5, $t3
-
- adcx $acc3, $acc1
- adox $acc3, $acc2
+ adc $t1, $acc1
adc \$0, $acc2
- mov $acc0, $t0
########################################################################
# Branch-less conditional subtraction of P
xor %eax, %eax
+ mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
- bt \$0,$acc2
- cmovnc $t2, $acc4
- cmovnc $t3, $acc5
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t0, $acc0
+ cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t1, $acc1
+ cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
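Both montx routines get the same two-part treatment: the interleaved adcx/adox carry chains (which needed an xor up front to clear CF and OF) become one plain add/adc chain with the zeroing xor moved after the step, and the final selection above no longer tests bit 0 of the carry word with bt + cmovnc but propagates the real borrow with sbb \$0 + cmovc -- the same correctness fix as in the montq path.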
@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
mov .Lpoly+8*3(%rip), $t1
# reduction step 1
- xor $acc0, $acc0
- adcx $t0, $acc1
- adcx $t4, $acc2
+ add $t0, $acc1
+ adc $t4, $acc2
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc0
mov $acc1, %rdx
- adcx $t0, $acc3
+ adc $t0, $acc3
shlx $a_ptr, $acc1, $t0
- adox $t4, $acc0
- shrx $a_ptr, $acc1, $t4
adc \$0, $acc0
+ shrx $a_ptr, $acc1, $t4
# reduction step 2
- xor $acc1, $acc1
- adcx $t0, $acc2
- adcx $t4, $acc3
+ add $t0, $acc2
+ adc $t4, $acc3
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc1
mov $acc2, %rdx
- adcx $t0, $acc0
+ adc $t0, $acc0
shlx $a_ptr, $acc2, $t0
- adox $t4, $acc1
- shrx $a_ptr, $acc2, $t4
adc \$0, $acc1
+ shrx $a_ptr, $acc2, $t4
# reduction step 3
- xor $acc2, $acc2
- adcx $t0, $acc3
- adcx $t4, $acc0
+ add $t0, $acc3
+ adc $t4, $acc0
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc2
mov $acc3, %rdx
- adcx $t0, $acc1
+ adc $t0, $acc1
shlx $a_ptr, $acc3, $t0
- adox $t4, $acc2
- shrx $a_ptr, $acc3, $t4
adc \$0, $acc2
+ shrx $a_ptr, $acc3, $t4
# reduction step 4
- xor $acc3, $acc3
- adcx $t0, $acc0
- adcx $t4, $acc1
+ add $t0, $acc0
+ adc $t4, $acc1
- mulx $t1, $t0, $t4
- adcx $t0, $acc2
- adox $t4, $acc3
+ mulx $t1, $t0, $acc3
+ adc $t0, $acc2
adc \$0, $acc3
xor $t3, $t3 # cf=0
@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $t3
- bt \$0,$t3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $acc3, $acc7
+ cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
@@ -1330,8 +1257,8 @@ ___
}
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
-my ($t0,$t1)=("%rcx","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
$code.=<<___;
################################################################################
@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
push %r13
mov 8*0($in_ptr), %rax
+ mov .Lpoly+8*3(%rip), $t2
mov 8*1($in_ptr), $acc1
mov 8*2($in_ptr), $acc2
mov 8*3($in_ptr), $acc3
- lea .Lpoly(%rip), $in_ptr
- xor $acc4, $acc4
mov %rax, $acc0
+ mov .Lpoly+8*1(%rip), $t1
#########################################
# First iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov %rax, $t0
+ shl \$32, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
#########################################
# Second iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t2
+ shr \$32, $t0
add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc2, $acc2
- add $t0, $acc0
- adc \$0, %rdx
- add %rax, $acc0
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- add $t0, $acc1
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t2
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ mov $acc0, $t0
+ adc %rax, $acc2
+ mov $acc1, $in_ptr
adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- sbb $acc3, $acc3
- mov 0*8($in_ptr), %rax
- mov 1*8($in_ptr), %rdx
- mov 2*8($in_ptr), $t0
- mov 3*8($in_ptr), $t1
-
- and $acc3, %rax
- and $acc3, %rdx
- and $acc3, $t0
- and $acc3, $t1
-
- sub %rax, $acc4
- sbb %rdx, $acc0
- mov $acc4, 8*0($r_ptr)
- sbb $t0, $acc1
- mov $acc0, 8*1($r_ptr)
- sbb $t1, $acc2
- mov $acc1, 8*2($r_ptr)
- mov $acc2, 8*3($r_ptr)
+ ###########################################
+ # Branch-less conditional subtraction
+ sub \$-1, $acc0
+ mov $acc2, %rax
+ sbb $t1, $acc1
+ sbb \$0, $acc2
+ mov %rdx, $acc3
+ sbb $t2, %rdx
+ sbb $t2, $t2
+
+ cmovnz $t0, $acc0
+ cmovnz $in_ptr, $acc1
+ mov $acc0, 8*0($r_ptr)
+ cmovnz %rax, $acc2
+ mov $acc1, 8*1($r_ptr)
+ cmovz %rdx, $acc3
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
pop %r13
pop %r12
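ecp_nistz256_from_mont gets a matching rewrite, but its new tail materializes the borrow as a mask instead of consuming CF directly: sbb $t2, $t2 leaves $t2 zero when no borrow occurred and all-ones when one did, and the cmovnz/cmovz pair then selects between the saved and subtracted limbs (the senses differ per limb because the two candidates live in different registers). A tiny self-contained illustration of the mask idiom, with hypothetical values:

    #include <stdint.h>
    #include <stdio.h>

    /* The "sbb $t2, $t2" idiom: subtracting a register from itself
     * with borrow yields 0 or ~0 depending only on the incoming carry
     * flag.  Hypothetical demonstration, not the committed code. */
    int main(void)
    {
        for (unsigned borrow = 0; borrow <= 1; borrow++) {
            uint64_t mask = 0 - (uint64_t)borrow;    /* sbb t2, t2   */
            uint64_t subtracted = 0x1111, saved = 0x2222;
            uint64_t r = mask ? saved : subtracted;  /* cmovnz       */
            printf("borrow=%u mask=%016llx keep=%#llx\n", borrow,
                   (unsigned long long)mask, (unsigned long long)r);
        }
        return 0;
    }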
hooks/post-receive
--
OpenSSL source code