[openssl-commits] [openssl] master update
Matt Caswell
matt at openssl.org
Wed Aug 24 09:46:49 UTC 2016
The branch master has been updated
via dfde4219fdebbb5a8a17602fea036f7690e517ea (commit)
via b62b2454fadfccaf5e055a1810d72174c2633b8f (commit)
from 9e421962e1cd58e302ebd8aca5d5a44198194243 (commit)
- Log -----------------------------------------------------------------
commit dfde4219fdebbb5a8a17602fea036f7690e517ea
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Aug 20 22:10:24 2016 +0200
ec/asm/ecp_nistz256-*.pl: addition to perform stricter reduction.
Addition was not preserving inputs' property of being fully reduced.
Thanks to Brian Smith for reporting this.
Reviewed-by: Rich Salz <rsalz at openssl.org>
commit b62b2454fadfccaf5e055a1810d72174c2633b8f
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Aug 20 22:04:21 2016 +0200
ec/asm/ecp_nistz256-x86_64.pl: addition to perform stricter reduction.
Addition was not preserving inputs' property of being fully reduced.
Thanks to Brian Smith for reporting this.
Reviewed-by: Rich Salz <rsalz at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/ec/asm/ecp_nistz256-armv4.pl | 124 ++++++++++++++++++++--------------
crypto/ec/asm/ecp_nistz256-armv8.pl | 12 ++--
crypto/ec/asm/ecp_nistz256-sparcv9.pl | 80 ++++++++++++++--------
crypto/ec/asm/ecp_nistz256-x86.pl | 31 +++++++--
crypto/ec/asm/ecp_nistz256-x86_64.pl | 115 ++++++++++++++++---------------
crypto/ec/ecp_nistz256.c | 31 +++++++--
6 files changed, 242 insertions(+), 151 deletions(-)
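For fully reduced inputs a, b < p, the sum a+b can land in [p, 2^256)
without carrying out of bit 256, so reducing only when the addition
carries leaves exactly those sums unreduced. Both commits therefore base
the final selection on the borrow of a trial subtraction of the modulus
(with the addition's carry folded in) instead of on the carry alone. The
following self-contained C sketch is illustrative only, not the OpenSSL
code; it assumes 4x64-bit little-endian limbs and the common unsigned
__int128 compiler extension:

#include <stdint.h>
#include <stdio.h>

/* P-256 modulus as 4x64-bit little-endian limbs */
static const uint64_t P256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL
};

static uint64_t addc(uint64_t a, uint64_t b, uint64_t *carry)
{
    unsigned __int128 t = (unsigned __int128)a + b + *carry;
    *carry = (uint64_t)(t >> 64);
    return (uint64_t)t;
}

static uint64_t subb(uint64_t a, uint64_t b, uint64_t *borrow)
{
    unsigned __int128 t = (unsigned __int128)a - b - *borrow;
    *borrow = (uint64_t)(t >> 64) & 1;
    return (uint64_t)t;
}

/* res = (a + b) mod p, assuming a and b are fully reduced (< p) */
static void p256_add_demo(uint64_t res[4],
                          const uint64_t a[4], const uint64_t b[4])
{
    uint64_t sum[4], dif[4], carry = 0, borrow = 0, mask;
    int i;

    for (i = 0; i < 4; i++)                 /* sum = a + b, carry = bit 256 */
        sum[i] = addc(a[i], b[i], &carry);
    for (i = 0; i < 4; i++)                 /* dif = sum - p (trial)        */
        dif[i] = subb(sum[i], P256[i], &borrow);

    /* keep sum only if the full trial subtraction borrowed, i.e.
     * carry == 0 and borrow == 1, which means a + b < p */
    mask = 0 - ((carry ^ 1) & borrow);      /* all-ones iff a + b < p */
    for (i = 0; i < 4; i++)
        res[i] = (sum[i] & mask) | (dif[i] & ~mask);
}

int main(void)
{
    /* a = p - 1, b = 1: a + b == p, which does NOT carry out of 2^256 */
    uint64_t a[4] = { P256[0] - 1, P256[1], P256[2], P256[3] };
    uint64_t b[4] = { 1, 0, 0, 0 };
    uint64_t r[4];

    p256_add_demo(r, a, b);
    printf("%016llx %016llx %016llx %016llx\n",
           (unsigned long long)r[3], (unsigned long long)r[2],
           (unsigned long long)r[1], (unsigned long long)r[0]);
    return 0;
}

With a = p-1 and b = 1 the sum equals p exactly and produces no carry,
which is precisely the case a carry-only check misses; the sketch prints
all zeros, whereas the old selection would have left the result equal to p.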
diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl
index 73b7a55..de3cd5c 100755
--- a/crypto/ec/asm/ecp_nistz256-armv4.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv4.pl
@@ -174,10 +174,7 @@ __ecp_nistz256_mul_by_2:
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
-#ifdef __thumb2__
- it cs
-#endif
- movcs $ff,#-1 @ $ff = carry ? -1 : 0
+ adc $ff,$ff,#0
b .Lreduce_by_sub
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@@ -228,35 +225,45 @@ __ecp_nistz256_add:
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
-#ifdef __thumb2__
- it cs
-#endif
- movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
+ adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
.Lreduce_by_sub:
- @ if a+b carries, subtract modulus.
+ @ if a+b >= modulus, subtract modulus.
@
+ @ But since comparison implies subtraction, we subtract
+ @ modulus and then add it back if the subtraction borrowed.
+
+ subs $a0,$a0,#-1
+ sbcs $a1,$a1,#-1
+ sbcs $a2,$a2,#-1
+ sbcs $a3,$a3,#0
+ sbcs $a4,$a4,#0
+ sbcs $a5,$a5,#0
+ sbcs $a6,$a6,#1
+ sbcs $a7,$a7,#-1
+ sbc $ff,$ff,#0
+
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
- @ using value of broadcasted carry as a whole or extracting
- @ single bit. Follow $ff register...
+ @ using value of borrow as a whole or extracting single bit.
+ @ Follow $ff register...
- subs $a0,$a0,$ff @ subtract synthesized modulus
- sbcs $a1,$a1,$ff
+ adds $a0,$a0,$ff @ add synthesized modulus
+ adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
- sbcs $a2,$a2,$ff
+ adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
- sbcs $a3,$a3,#0
+ adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
- sbcs $a4,$a4,#0
+ adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
- sbcs $a5,$a5,#0
+ adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
- sbcs $a6,$a6,$ff,lsr#31
+ adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
- sbcs $a7,$a7,$ff
+ adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
@@ -304,26 +311,29 @@ __ecp_nistz256_mul_by_3:
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
-#ifdef __thumb2__
- it cs
-#endif
- movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
-
- subs $a0,$a0,$ff @ subtract synthesized modulus, see
- @ .Lreduce_by_sub for details, except
- @ that we don't write anything to
- @ memory, but keep intermediate
- @ results in registers...
- sbcs $a1,$a1,$ff
- sbcs $a2,$a2,$ff
+ adc $ff,$ff,#0
+
+ subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
+ sbcs $a1,$a1,#-1
+ sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
- ldr $b_ptr,[$a_ptr,#0]
sbcs $a5,$a5,#0
+ sbcs $a6,$a6,#1
+ sbcs $a7,$a7,#-1
+ sbc $ff,$ff,#0
+
+ adds $a0,$a0,$ff @ add synthesized modulus
+ adcs $a1,$a1,$ff
+ adcs $a2,$a2,$ff
+ adcs $a3,$a3,#0
+ adcs $a4,$a4,#0
+ ldr $b_ptr,[$a_ptr,#0]
+ adcs $a5,$a5,#0
ldr $t1,[$a_ptr,#4]
- sbcs $a6,$a6,$ff,lsr#31
+ adcs $a6,$a6,$ff,lsr#31
ldr $t2,[$a_ptr,#8]
- sbcs $a7,$a7,$ff
+ adc $a7,$a7,$ff
ldr $t0,[$a_ptr,#12]
adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
@@ -339,10 +349,7 @@ __ecp_nistz256_mul_by_3:
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
-#ifdef __thumb2__
- it cs
-#endif
- movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
+ adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
b .Lreduce_by_sub
@@ -1210,25 +1217,42 @@ __ecp_nistz256_add_self:
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
-#ifdef __thumb2__
- it cs
-#endif
- movcs $ff,#-1 @ $ff = carry ? -1 : 0
+ adc $ff,$ff,#0
+
+ @ if a+b >= modulus, subtract modulus.
+ @
+ @ But since comparison implies subtraction, we subtract
+ @ modulus and then add it back if the subtraction borrowed.
+
+ subs $a0,$a0,#-1
+ sbcs $a1,$a1,#-1
+ sbcs $a2,$a2,#-1
+ sbcs $a3,$a3,#0
+ sbcs $a4,$a4,#0
+ sbcs $a5,$a5,#0
+ sbcs $a6,$a6,#1
+ sbcs $a7,$a7,#-1
+ sbc $ff,$ff,#0
- subs $a0,$a0,$ff @ subtract synthesized modulus
- sbcs $a1,$a1,$ff
+ @ Note that because mod has special form, i.e. consists of
+ @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
+ @ using value of borrow as a whole or extracting single bit.
+ @ Follow $ff register...
+
+ adds $a0,$a0,$ff @ add synthesized modulus
+ adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
- sbcs $a2,$a2,$ff
+ adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
- sbcs $a3,$a3,#0
+ adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
- sbcs $a4,$a4,#0
+ adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
- sbcs $a5,$a5,#0
+ adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
- sbcs $a6,$a6,$ff,lsr#31
+ adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
- sbcs $a7,$a7,$ff
+ adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
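Because the P-256 modulus consists only of 0xffffffff, 0x00000001 and
zero words, the 32-bit ARM code above never loads it from memory: after
the trial subtraction it rebuilds the value to add back directly from the
all-ones/all-zeros borrow mask left in $ff (note the lone-1 word coming
from $ff,lsr#31). A minimal C sketch of that idea follows; p256_add_back
is a hypothetical name, and the layout is 8x32-bit little-endian limbs:

#include <stdint.h>

/*
 * mask is 0xffffffff if the 256-bit trial subtraction of p borrowed
 * (i.e. the addition did not carry and the sum is < p), 0 otherwise.
 */
void p256_add_back(uint32_t r[8], uint32_t mask)
{
    uint64_t t, c = 0;
    uint32_t w[8];
    int i;

    w[0] = w[1] = w[2] = mask;   /* the 0xffffffff words of p, or 0 */
    w[3] = w[4] = w[5] = 0;      /* the zero words of p             */
    w[6] = mask >> 31;           /* the single 1 word of p, or 0    */
    w[7] = mask;                 /* the top 0xffffffff word, or 0   */

    for (i = 0; i < 8; i++) {    /* r += synthesized modulus        */
        t = (uint64_t)r[i] + w[i] + c;
        r[i] = (uint32_t)t;
        c = t >> 32;
    }
    /* the final carry c cancels the earlier borrow and is discarded */
}

The final carry out of the add-back undoes the wrap-around of the
borrowing trial subtraction, so dropping it is exactly right, as in the
assembly.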
diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl
index c5c1280..1362586 100644
--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
@@ -583,14 +583,14 @@ __ecp_nistz256_add:
adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
sbcs $t1,$acc1,$poly1
sbcs $t2,$acc2,xzr
- sbc $t3,$acc3,$poly3
- cmp $ap,xzr // did addition carry?
+ sbcs $t3,$acc3,$poly3
+ sbcs xzr,$ap,xzr // did subtraction borrow?
- csel $acc0,$acc0,$t0,eq // ret = carry ? ret-modulus : ret
- csel $acc1,$acc1,$t1,eq
- csel $acc2,$acc2,$t2,eq
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
stp $acc0,$acc1,[$rp]
- csel $acc3,$acc3,$t3,eq
+ csel $acc3,$acc3,$t3,lo
stp $acc2,$acc3,[$rp,#16]
ret
diff --git a/crypto/ec/asm/ecp_nistz256-sparcv9.pl b/crypto/ec/asm/ecp_nistz256-sparcv9.pl
index 3f39088..3c7ff50 100755
--- a/crypto/ec/asm/ecp_nistz256-sparcv9.pl
+++ b/crypto/ec/asm/ecp_nistz256-sparcv9.pl
@@ -406,33 +406,44 @@ __ecp_nistz256_add:
addccc @acc[5],$t5, at acc[5]
addccc @acc[6],$t6, at acc[6]
addccc @acc[7],$t7, at acc[7]
- subc %g0,%g0,$carry ! broadcast carry bit
+ addc %g0,%g0,$carry
.Lreduce_by_sub:
- ! if a+b carries, subtract modulus.
+ ! if a+b >= modulus, subtract modulus.
!
+ ! But since comparison implies subtraction, we subtract
+ ! modulus and then add it back if the subtraction borrowed.
+
+ subcc @acc[0],-1, at acc[0]
+ subccc @acc[1],-1, at acc[1]
+ subccc @acc[2],-1, at acc[2]
+ subccc @acc[3], 0, at acc[3]
+ subccc @acc[4], 0, at acc[4]
+ subccc @acc[5], 0, at acc[5]
+ subccc @acc[6], 1, at acc[6]
+ subccc @acc[7],-1, at acc[7]
+ subc $carry,0,$carry
+
! Note that because mod has special form, i.e. consists of
! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
- ! using value of broadcasted borrow and the borrow bit itself.
- ! To minimize dependency chain we first broadcast and then
- ! extract the bit by negating (follow $bi).
+ ! using value of borrow and its negative.
- subcc @acc[0],$carry, at acc[0] ! subtract synthesized modulus
- subccc @acc[1],$carry, at acc[1]
+ addcc @acc[0],$carry, at acc[0] ! add synthesized modulus
+ addccc @acc[1],$carry, at acc[1]
neg $carry,$bi
st @acc[0],[$rp]
- subccc @acc[2],$carry, at acc[2]
+ addccc @acc[2],$carry, at acc[2]
st @acc[1],[$rp+4]
- subccc @acc[3],0, at acc[3]
+ addccc @acc[3],0, at acc[3]
st @acc[2],[$rp+8]
- subccc @acc[4],0, at acc[4]
+ addccc @acc[4],0, at acc[4]
st @acc[3],[$rp+12]
- subccc @acc[5],0, at acc[5]
+ addccc @acc[5],0, at acc[5]
st @acc[4],[$rp+16]
- subccc @acc[6],$bi, at acc[6]
+ addccc @acc[6],$bi, at acc[6]
st @acc[5],[$rp+20]
- subc @acc[7],$carry, at acc[7]
+ addc @acc[7],$carry, at acc[7]
st @acc[6],[$rp+24]
retl
st @acc[7],[$rp+28]
@@ -469,7 +480,7 @@ __ecp_nistz256_mul_by_2:
addccc @acc[6], at acc[6], at acc[6]
addccc @acc[7], at acc[7], at acc[7]
b .Lreduce_by_sub
- subc %g0,%g0,$carry ! broadcast carry bit
+ addc %g0,%g0,$carry
.type __ecp_nistz256_mul_by_2,#function
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@@ -502,17 +513,27 @@ __ecp_nistz256_mul_by_3:
addccc @acc[5], at acc[5],$t5
addccc @acc[6], at acc[6],$t6
addccc @acc[7], at acc[7],$t7
- subc %g0,%g0,$carry ! broadcast carry bit
+ addc %g0,%g0,$carry
- subcc $t0,$carry,$t0 ! .Lreduce_by_sub but without stores
+ subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores
+ subccc $t1,-1,$t1
+ subccc $t2,-1,$t2
+ subccc $t3, 0,$t3
+ subccc $t4, 0,$t4
+ subccc $t5, 0,$t5
+ subccc $t6, 1,$t6
+ subccc $t7,-1,$t7
+ subc $carry,0,$carry
+
+ addcc $t0,$carry,$t0 ! add synthesized modulus
+ addccc $t1,$carry,$t1
neg $carry,$bi
- subccc $t1,$carry,$t1
- subccc $t2,$carry,$t2
- subccc $t3,0,$t3
- subccc $t4,0,$t4
- subccc $t5,0,$t5
- subccc $t6,$bi,$t6
- subc $t7,$carry,$t7
+ addccc $t2,$carry,$t2
+ addccc $t3,0,$t3
+ addccc $t4,0,$t4
+ addccc $t5,0,$t5
+ addccc $t6,$bi,$t6
+ addc $t7,$carry,$t7
addcc $t0, at acc[0], at acc[0] ! 2*a+a=3*a
addccc $t1, at acc[1], at acc[1]
@@ -523,7 +544,7 @@ __ecp_nistz256_mul_by_3:
addccc $t6, at acc[6], at acc[6]
addccc $t7, at acc[7], at acc[7]
b .Lreduce_by_sub
- subc %g0,%g0,$carry ! broadcast carry bit
+ addc %g0,%g0,$carry
.type __ecp_nistz256_mul_by_3,#function
.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
@@ -1662,14 +1683,15 @@ __ecp_nistz256_add_noload_vis3:
addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
addxccc $acc1,$poly1,$t1
addxccc $acc2,$minus1,$t2
- addxc $acc3,$poly3,$t3
+ addxccc $acc3,$poly3,$t3
+ addxc $acc4,$minus1,$acc4
- movrnz $acc4,$t0,$acc0 ! if a+b carried, ret = ret-mod
- movrnz $acc4,$t1,$acc1
+ movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus
+ movrz $acc4,$t1,$acc1
stx $acc0,[$rp]
- movrnz $acc4,$t2,$acc2
+ movrz $acc4,$t2,$acc2
stx $acc1,[$rp+8]
- movrnz $acc4,$t3,$acc3
+ movrz $acc4,$t3,$acc3
stx $acc2,[$rp+16]
retl
stx $acc3,[$rp+24]
diff --git a/crypto/ec/asm/ecp_nistz256-x86.pl b/crypto/ec/asm/ecp_nistz256-x86.pl
index e9fa038..b96b1aa 100755
--- a/crypto/ec/asm/ecp_nistz256-x86.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86.pl
@@ -284,18 +284,41 @@ for(1..37) {
&mov (&DWP(16,"edi"),"eax");
&adc ("ecx",&DWP(24,"ebp"));
&mov (&DWP(20,"edi"),"ebx");
+ &mov ("esi",0);
&adc ("edx",&DWP(28,"ebp"));
&mov (&DWP(24,"edi"),"ecx");
- &sbb ("esi","esi"); # broadcast carry bit
+ &adc ("esi",0);
&mov (&DWP(28,"edi"),"edx");
- # if a+b carries, subtract modulus.
+ # if a+b >= modulus, subtract modulus.
#
+ # But since comparison implies subtraction, we subtract modulus
+ # to see if it borrows, and then subtract it for real if
+ # subtraction didn't borrow.
+
+ &mov ("eax",&DWP(0,"edi"));
+ &mov ("ebx",&DWP(4,"edi"));
+ &mov ("ecx",&DWP(8,"edi"));
+ &sub ("eax",-1);
+ &mov ("edx",&DWP(12,"edi"));
+ &sbb ("ebx",-1);
+ &mov ("eax",&DWP(16,"edi"));
+ &sbb ("ecx",-1);
+ &mov ("ebx",&DWP(20,"edi"));
+ &sbb ("edx",0);
+ &mov ("ecx",&DWP(24,"edi"));
+ &sbb ("eax",0);
+ &mov ("edx",&DWP(28,"edi"));
+ &sbb ("ebx",0);
+ &sbb ("ecx",1);
+ &sbb ("edx",-1);
+ &sbb ("esi",0);
+
# Note that because mod has special form, i.e. consists of
# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
- # assigning carry bit to one register, %ebp, and its negative
- # to another, %esi. But we started by calculating %esi...
+ # using the value of the borrow.
+ ¬ ("esi");
&mov ("eax",&DWP(0,"edi"));
&mov ("ebp","esi");
&mov ("ebx",&DWP(4,"edi"));
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index cce92b9..cc7b976 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -135,6 +135,7 @@ ecp_nistz256_mul_by_2:
push %r13
mov 8*0($a_ptr), $a0
+ xor $t4,$t4
mov 8*1($a_ptr), $a1
add $a0, $a0 # a0:a3+a0:a3
mov 8*2($a_ptr), $a2
@@ -145,7 +146,7 @@ ecp_nistz256_mul_by_2:
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub 8*0($a_ptr), $a0
mov $a2, $t2
@@ -153,14 +154,14 @@ ecp_nistz256_mul_by_2:
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -257,12 +258,12 @@ ecp_nistz256_mul_by_3:
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
- cmovz $t2, $a2
- cmovz $t3, $a3
+ cmovb $t0, $a0
+ cmovb $t1, $a1
+ cmovb $t2, $a2
+ cmovb $t3, $a3
xor $t4, $t4
add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
@@ -279,14 +280,14 @@ ecp_nistz256_mul_by_3:
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -325,14 +326,14 @@ ecp_nistz256_add:
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -1890,13 +1891,14 @@ $code.=<<___;
.type __ecp_nistz256_add_toq,\@abi-omnipotent
.align 32
__ecp_nistz256_add_toq:
+ xor $t4,$t4
add 8*0($b_ptr), $a0
adc 8*1($b_ptr), $a1
mov $a0, $t0
adc 8*2($b_ptr), $a2
adc 8*3($b_ptr), $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
@@ -1904,14 +1906,14 @@ __ecp_nistz256_add_toq:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -1979,13 +1981,14 @@ __ecp_nistz256_subq:
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_by_2q:
+ xor $t4, $t4
add $a0, $a0 # a0:a3+a0:a3
adc $a1, $a1
mov $a0, $t0
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
@@ -1993,14 +1996,14 @@ __ecp_nistz256_mul_by_2q:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -2455,6 +2458,7 @@ $code.=<<___;
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+ xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
@@ -2462,7 +2466,7 @@ $code.=<<___;
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
@@ -2470,15 +2474,15 @@ $code.=<<___;
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $acc0
+ cmovb $t0, $acc0
mov 8*0($a_ptr), $t0
- cmovz $t1, $acc1
+ cmovb $t1, $acc1
mov 8*1($a_ptr), $t1
- cmovz $t2, $acc2
+ cmovb $t2, $acc2
mov 8*2($a_ptr), $t2
- cmovz $t3, $acc3
+ cmovb $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
@@ -2760,6 +2764,7 @@ $code.=<<___;
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+ xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
@@ -2767,7 +2772,7 @@ $code.=<<___;
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
@@ -2775,15 +2780,15 @@ $code.=<<___;
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $acc0
+ cmovb $t0, $acc0
mov 8*0($a_ptr), $t0
- cmovz $t1, $acc1
+ cmovb $t1, $acc1
mov 8*1($a_ptr), $t1
- cmovz $t2, $acc2
+ cmovb $t2, $acc2
mov 8*2($a_ptr), $t2
- cmovz $t3, $acc3
+ cmovb $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
@@ -2935,14 +2940,14 @@ __ecp_nistz256_add_tox:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
+ sbb \$0, $t4
- bt \$0, $t4
- cmovnc $t0, $a0
- cmovnc $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovnc $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovnc $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
@@ -3030,14 +3035,14 @@ __ecp_nistz256_mul_by_2x:
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
+ sbb \$0, $t4
- bt \$0, $t4
- cmovnc $t0, $a0
- cmovnc $t1, $a1
+ cmovb $t0, $a0
+ cmovb $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovnc $t2, $a2
+ cmovb $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovnc $t3, $a3
+ cmovb $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index d2fabe5..564a889 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -89,19 +89,36 @@ struct nistz256_pre_comp_st {
};
/* Functions implemented in assembly */
+/*
+ * Most of the functions below *preserve* the property of the inputs
+ * being fully reduced, i.e. being in the [0, modulus) range. Simply
+ * put, if the inputs are fully reduced, then so is the output. Note
+ * that the reverse is not true: given partially reduced inputs, the
+ * output can be either fully or partially reduced. The "most" in the
+ * first sentence refers to the fact that, given the calculation flow,
+ * one can tolerate addition (the first function below) producing a
+ * partially reduced result *provided that* the multiplications by 2
+ * and 3, which customarily build on addition, fully reduce it. This
+ * effectively gives two options: a) addition produces a fully reduced
+ * result [as long as the inputs are, just like the remaining
+ * functions]; b) addition is allowed to produce a partially reduced
+ * result, but the multiplications by 2 and 3 perform an additional
+ * reduction step. The choice between the two can be platform-specific,
+ * but it was a) in all cases so far...
+ */
+/* Modular add: res = a+b mod P */
+void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
/* Modular mul by 2: res = 2*a mod P */
void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
-/* Modular div by 2: res = a/2 mod P */
-void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
- const BN_ULONG a[P256_LIMBS]);
/* Modular mul by 3: res = 3*a mod P */
void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
-/* Modular add: res = a+b mod P */
-void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
- const BN_ULONG a[P256_LIMBS],
- const BN_ULONG b[P256_LIMBS]);
+
+/* Modular div by 2: res = a/2 mod P */
+void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS]);
/* Modular sub: res = a-b mod P */
void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],