[openssl-commits] [openssl] OpenSSL source code branch master updated. 9e557ab2624d5c5e8d799c123f5e8211664d8845
Andy Polyakov
appro at openssl.org
Sun Jan 4 22:22:27 UTC 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "OpenSSL source code".
The branch, master has been updated
via 9e557ab2624d5c5e8d799c123f5e8211664d8845 (commit)
from 2c60925d1ccc0b96287bdc9acb90198e7180d642 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 9e557ab2624d5c5e8d799c123f5e8211664d8845
Author: Andy Polyakov <appro at openssl.org>
Date: Sun Jan 4 21:29:50 2015 +0100
ecp_nistz256-x86_64.pl: fix occasional failures.
RT: 3607
Reviewed-by: Adam Langley <agl at google.com>
Reviewed-by: Emilia Kasper <emilia at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/ec/asm/ecp_nistz256-x86_64.pl | 481 ++++++++++++++--------------------
1 file changed, 191 insertions(+), 290 deletions(-)
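In outline, the patch makes three kinds of changes to the P-256 code. In the mulq-based Montgomery paths it replaces the multiplications by the two low prime limbs with a shl/shr pair, exploiting the identity p[0] + p[1]*2^64 == 2^96 - 1. In the mulx/ADX paths it drops the interleaved adcx/adox carry chains in favour of a single add/adc chain. And in every final conditional subtraction it propagates the borrow through the carry word (sbb instead of neg or bt) and flips the selection from cmovnc to cmovc, which appears to be the correctness fix behind RT#3607; the first two changes account for the updated speedup table below.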
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 4486a5e..cdff22a 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -31,15 +31,16 @@
# Further optimization by <appro at openssl.org>:
#
# this/original
-# Opteron +8-33%
-# Bulldozer +10-30%
-# P4 +14-38%
-# Westmere +8-23%
-# Sandy Bridge +8-24%
-# Ivy Bridge +7-25%
-# Haswell +5-25%
-# Atom +10-32%
-# VIA Nano +37-130%
+# Opteron +12-49%
+# Bulldozer +14-45%
+# P4 +18-46%
+# Westmere +12-34%
+# Sandy Bridge +9-35%
+# Ivy Bridge +9-35%
+# Haswell +8-37%
+# Broadwell +18-58%
+# Atom +15-50%
+# VIA Nano +43-160%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively
@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
# and add the result to the acc.
# Due to the special form of p256 we do some optimizations
#
- # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
- # then we add acc[0] and get acc[0] x 2^64
-
- mulq $poly1
- xor $t0, $t0
- add $acc0, $acc1 # +=acc[0]*2^64
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
-
- # acc[0] x p256[2] = 0
- adc %rdx, $acc2
- adc \$0, $t0
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+ # then we add acc[0] and get acc[0] x 2^96
+ mov $acc0, $t1
+ shl \$32, $acc0
mulq $poly3
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t1
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t1, $acc2
+ adc %rax, $acc3
mov 8*1($b_ptr), %rax
adc %rdx, $acc4
adc \$0, $acc5
+ xor $acc0, $acc0
########################################################################
# Multiply by b[1]
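The comment in this hunk is the whole trick: the low limbs of the P-256 prime satisfy p[0] + p[1]*2^64 == 2^96 - 1, and -p^-1 mod 2^64 == 1, so the Montgomery multiplier is acc[0] itself, and folding it in needs no multiplication by p[0..1] at all -- acc[0]*p[0..1] plus the acc[0] already sitting in the low limb is exactly acc[0] << 96, one shl/shr pair. A minimal C sketch of one such reduction step (an illustration under these identities, not the committed Perl/asm; assumes a compiler with unsigned __int128):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    #define P3 0xffffffff00000001ULL          /* .Lpoly[3] */

    /* Fold limb a[0] into a[1..4], i.e. acc += a[0]*p; afterwards the
     * low limb is zero by construction and is simply dropped.  Mirrors
     * the new shl/shr + single-mulq sequence. */
    static uint64_t reduce_step(uint64_t a[5])
    {
        uint64_t m  = a[0];                   /* -p^-1 mod 2^64 == 1 */
        uint64_t lo = m << 32, hi = m >> 32;  /* m << 96, split      */
        u128 mp3 = (u128)m * P3;              /* the one real mulq   */
        u128 t;

        t = (u128)a[1] + lo;                             /* add      */
        a[1] = (uint64_t)t;
        t = (u128)a[2] + hi + (uint64_t)(t >> 64);       /* adc      */
        a[2] = (uint64_t)t;
        t = (u128)a[3] + (uint64_t)mp3 + (uint64_t)(t >> 64);
        a[3] = (uint64_t)t;
        t = (u128)a[4] + (uint64_t)(mp3 >> 64) + (uint64_t)(t >> 64);
        a[4] = (uint64_t)t;
        return (uint64_t)(t >> 64);           /* carry into acc[5]   */
    }

Each of the four reduction steps in __ecp_nistz256_mul_montq is an instance of this, with the mulq by p[3] interleaved between the shifts so the multiply's latency overlaps the additions.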
@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
########################################################################
# Second reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
+ mov $acc1, $t1
+ shl \$32, $acc1
mulq $poly3
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t1
+ add $acc1, $acc2
+ adc $t1, $acc3
+ adc %rax, $acc4
mov 8*2($b_ptr), %rax
adc %rdx, $acc5
adc \$0, $acc0
+ xor $acc1, $acc1
########################################################################
# Multiply by b[2]
@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
########################################################################
# Third reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
+ mov $acc2, $t1
+ shl \$32, $acc2
mulq $poly3
- xor $acc2, $acc2
- add $t0, $acc5
- adc \$0, %rdx
- add %rax, $acc5
+ shr \$32, $t1
+ add $acc2, $acc3
+ adc $t1, $acc4
+ adc %rax, $acc5
mov 8*3($b_ptr), %rax
adc %rdx, $acc0
adc \$0, $acc1
+ xor $acc2, $acc2
########################################################################
# Multiply by b[3]
@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
########################################################################
# Final reduction step
- mulq $poly1
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc5
- #adc \$0, $t0 # doesn't overflow
-
+ mov $acc3, $t1
+ shl \$32, $acc3
mulq $poly3
- #add $t0, $acc0
- #adc \$0, %rdx
+ shr \$32, $t1
+ add $acc3, $acc4
+ adc $t1, $acc5
mov $acc4, $t0
- add %rax, $acc0
+ adc %rax, $acc0
adc %rdx, $acc1
mov $acc5, $t1
adc \$0, $acc2
@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3]
- neg $acc2
+ sbb \$0, $acc2
- cmovnc $t0, $acc4
- cmovnc $t1, $acc5
+ cmovc $t0, $acc4
+ cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t2, $acc0
+ cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t3, $acc1
+ cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
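Reading this hunk suggests the likely root cause of the "occasional failures" (the log does not spell it out, so treat this as inference): the reduced value occupies four limbs plus a carry word ($acc2), and the old tail tested only that carry word -- neg $acc2 sets CF iff $acc2 is nonzero -- while discarding the borrow from the four-limb sbb chain. A value with a zero carry word but limbs >= p therefore escaped the final subtraction. The fix folds the borrow through the carry word with sbb \$0, $acc2, after which CF is set exactly when the full five-word value was below p, and the selection flips from cmovnc to cmovc. A hedged C sketch of the corrected branchless tail (illustration only; little-endian limbs):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Branchless final subtraction for P-256, mirroring the fixed
     * tail: subtract p from the 4-limb result, fold the borrow
     * through the carry word ("sbb $0"), and keep the pre-subtraction
     * value only when the full 5-word quantity was below p. */
    static void cond_sub_p256(uint64_t a[4], uint64_t carry)
    {
        static const uint64_t p[4] = {                /* .Lpoly */
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t r[4], borrow = 0, keep;
        unsigned i;

        for (i = 0; i < 4; i++) {                 /* sub/sbb chain    */
            u128 d = (u128)a[i] - p[i] - borrow;
            r[i]   = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        /* sbb $0, carry: CF set iff (carry:a) < p */
        keep = 0 - (uint64_t)(carry < borrow);    /* all-ones if CF   */

        for (i = 0; i < 4; i++)                   /* cmovc            */
            a[i] = (a[i] & keep) | (r[i] & ~keep);
    }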
@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
##########################################
# Now the reduction
# First iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc0, $t0
+ shl \$32, $acc0
mulq $t1
- xor $acc0, $acc0
- #add $t0, $acc3
- #adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t0
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
##########################################
# Second iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
mulq $t1
- xor $acc1, $acc1
- #add $t0, $acc4
- #adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
mulq $t1
- xor $acc2, $acc2
- #add $t0, $acc0
- #adc \$0, %rdx
- add %rax, $acc0
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
mulq $t1
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ adc %rax, $acc2
+ adc \$0, %rdx
xor $acc3, $acc3
- #add $t0, $acc1
- #adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- adc \$0, $acc3
############################################
# Add the rest of the acc
- add $acc0, $acc5
+ add $acc0, $acc4
+ adc $acc1, $acc5
mov $acc4, $acc0
- adc $acc1, $acc6
- adc $acc2, $acc7
+ adc $acc2, $acc6
+ adc %rdx, $acc7
mov $acc5, $acc1
adc \$0, $acc3
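Note also how the rewritten squaring reduction pipelines the carries: each step ends with adc \$0, %rdx, leaving the high half of its p[3] product in %rdx, and the next step banks it with mov %rdx, $accN before issuing its own mulq, so no extra accumulator register is needed between iterations.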
@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3]
- neg $acc3
+ sbb \$0, $acc3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $t0, $acc7
+ cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# First reduction step
- xor $acc0, $acc0 # $acc0=0,cf=0,of=0
- adox $t1, $acc1
- adox $t0, $acc2
+ add $t1, $acc1
+ adc $t0, $acc2
mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx
- adox $t0, $acc3
- adcx $t1, $acc4
-
- adox $acc0, $acc4
- adcx $acc0, $acc5 # cf=0
- adox $acc0, $acc5 # of=0
+ adc $t0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
########################################################################
# Multiply by b[1]
@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# Second reduction step
- xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
- adox $t0, $acc2
- adox $t1, $acc3
+ add $t0, $acc2
+ adc $t1, $acc3
mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx
- adox $t0, $acc4
- adcx $t1, $acc5
-
- adox $acc1, $acc5
- adcx $acc1, $acc0 # cf=0
- adox $acc1, $acc0 # of=0
+ adc $t0, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+ xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
########################################################################
# Multiply by b[2]
@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
########################################################################
# Third reduction step
- xor $acc2, $acc2 # $acc2=0,cf=0,of=0
- adox $t0, $acc3
- adox $t1, $acc4
+ add $t0, $acc3
+ adc $t1, $acc4
mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx
- adox $t0, $acc5
- adcx $t1, $acc0
-
- adox $acc2, $acc0
- adcx $acc2, $acc1 # cf=0
- adox $acc2, $acc1 # of=0
+ adc $t0, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
########################################################################
# Multiply by b[3]
@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
########################################################################
# Fourth reduction step
- xor $acc3, $acc3 # $acc3=0,cf=0,of=0
- adox $t0, $acc4
- adox $t1, $acc5
+ add $t0, $acc4
+ adc $t1, $acc5
mulx $poly3, $t0, $t1
mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1
- adcx $t0, $acc0
- adox $t1, $acc1
+ adc $t0, $acc0
mov $acc5, $t3
-
- adcx $acc3, $acc1
- adox $acc3, $acc2
+ adc $t1, $acc1
adc \$0, $acc2
- mov $acc0, $t0
########################################################################
# Branch-less conditional subtraction of P
xor %eax, %eax
+ mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
- bt \$0,$acc2
- cmovnc $t2, $acc4
- cmovnc $t3, $acc5
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t0, $acc0
+ cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t1, $acc1
+ cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
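Both montx routines get the same two-part treatment: the interleaved adcx/adox carry chains (which needed an xor up front to clear CF and OF) become one plain add/adc chain with the zeroing xor moved after the step, and the final selection above no longer tests bit 0 of the carry word with bt + cmovnc but propagates the real borrow with sbb \$0 + cmovc -- the same correctness fix as in the montq path.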
@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
mov .Lpoly+8*3(%rip), $t1
# reduction step 1
- xor $acc0, $acc0
- adcx $t0, $acc1
- adcx $t4, $acc2
+ add $t0, $acc1
+ adc $t4, $acc2
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc0
mov $acc1, %rdx
- adcx $t0, $acc3
+ adc $t0, $acc3
shlx $a_ptr, $acc1, $t0
- adox $t4, $acc0
- shrx $a_ptr, $acc1, $t4
adc \$0, $acc0
+ shrx $a_ptr, $acc1, $t4
# reduction step 2
- xor $acc1, $acc1
- adcx $t0, $acc2
- adcx $t4, $acc3
+ add $t0, $acc2
+ adc $t4, $acc3
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc1
mov $acc2, %rdx
- adcx $t0, $acc0
+ adc $t0, $acc0
shlx $a_ptr, $acc2, $t0
- adox $t4, $acc1
- shrx $a_ptr, $acc2, $t4
adc \$0, $acc1
+ shrx $a_ptr, $acc2, $t4
# reduction step 3
- xor $acc2, $acc2
- adcx $t0, $acc3
- adcx $t4, $acc0
+ add $t0, $acc3
+ adc $t4, $acc0
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc2
mov $acc3, %rdx
- adcx $t0, $acc1
+ adc $t0, $acc1
shlx $a_ptr, $acc3, $t0
- adox $t4, $acc2
- shrx $a_ptr, $acc3, $t4
adc \$0, $acc2
+ shrx $a_ptr, $acc3, $t4
# reduction step 4
- xor $acc3, $acc3
- adcx $t0, $acc0
- adcx $t4, $acc1
+ add $t0, $acc0
+ adc $t4, $acc1
- mulx $t1, $t0, $t4
- adcx $t0, $acc2
- adox $t4, $acc3
+ mulx $t1, $t0, $acc3
+ adc $t0, $acc2
adc \$0, $acc3
xor $t3, $t3 # cf=0
@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $t3
- bt \$0,$t3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $acc3, $acc7
+ cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
@@ -1330,8 +1257,8 @@ ___
}
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
-my ($t0,$t1)=("%rcx","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
$code.=<<___;
################################################################################
@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
push %r13
mov 8*0($in_ptr), %rax
+ mov .Lpoly+8*3(%rip), $t2
mov 8*1($in_ptr), $acc1
mov 8*2($in_ptr), $acc2
mov 8*3($in_ptr), $acc3
- lea .Lpoly(%rip), $in_ptr
- xor $acc4, $acc4
mov %rax, $acc0
+ mov .Lpoly+8*1(%rip), $t1
#########################################
# First iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov %rax, $t0
+ shl \$32, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
#########################################
# Second iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t2
+ shr \$32, $t0
add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc2, $acc2
- add $t0, $acc0
- adc \$0, %rdx
- add %rax, $acc0
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- add $t0, $acc1
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t2
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ mov $acc0, $t0
+ adc %rax, $acc2
+ mov $acc1, $in_ptr
adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- sbb $acc3, $acc3
- mov 0*8($in_ptr), %rax
- mov 1*8($in_ptr), %rdx
- mov 2*8($in_ptr), $t0
- mov 3*8($in_ptr), $t1
-
- and $acc3, %rax
- and $acc3, %rdx
- and $acc3, $t0
- and $acc3, $t1
-
- sub %rax, $acc4
- sbb %rdx, $acc0
- mov $acc4, 8*0($r_ptr)
- sbb $t0, $acc1
- mov $acc0, 8*1($r_ptr)
- sbb $t1, $acc2
- mov $acc1, 8*2($r_ptr)
- mov $acc2, 8*3($r_ptr)
+ ###########################################
+ # Branch-less conditional subtraction
+ sub \$-1, $acc0
+ mov $acc2, %rax
+ sbb $t1, $acc1
+ sbb \$0, $acc2
+ mov %rdx, $acc3
+ sbb $t2, %rdx
+ sbb $t2, $t2
+
+ cmovnz $t0, $acc0
+ cmovnz $in_ptr, $acc1
+ mov $acc0, 8*0($r_ptr)
+ cmovnz %rax, $acc2
+ mov $acc1, 8*1($r_ptr)
+ cmovz %rdx, $acc3
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
pop %r13
pop %r12
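ecp_nistz256_from_mont gets a matching rewrite, but its new tail materializes the borrow as a mask instead of consuming CF directly: sbb $t2, $t2 leaves $t2 zero when no borrow occurred and all-ones when one did, and the cmovnz/cmovz pair then selects between the saved and subtracted limbs (the senses differ per limb because the two candidates live in different registers). A tiny self-contained illustration of the mask idiom, with hypothetical values:

    #include <stdint.h>
    #include <stdio.h>

    /* The "sbb $t2, $t2" idiom: subtracting a register from itself
     * with borrow yields 0 or ~0 depending only on the incoming carry
     * flag.  Hypothetical demonstration, not the committed code. */
    int main(void)
    {
        for (unsigned borrow = 0; borrow <= 1; borrow++) {
            uint64_t mask = 0 - (uint64_t)borrow;    /* sbb t2, t2   */
            uint64_t subtracted = 0x1111, saved = 0x2222;
            uint64_t r = mask ? saved : subtracted;  /* cmovnz       */
            printf("borrow=%u mask=%016llx keep=%#llx\n", borrow,
                   (unsigned long long)mask, (unsigned long long)r);
        }
        return 0;
    }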
hooks/post-receive
--
OpenSSL source code