[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Sun Jan 7 20:33:34 UTC 2018
The branch master has been updated
via 10bc3409459a525654d6b986b3cd49d22dd95460 (commit)
via ab4f2026b7ff8902d70ddd75adc080fc66ffd642 (commit)
via d5e11843fe430dfa89bdf83b6f7805c709dcdb41 (commit)
via eb7916960bf50f436593abe3d5f2e0592d291017 (commit)
from 617b49db14fa4c1211bfc5d0e88294d0f159c9a9 (commit)
- Log -----------------------------------------------------------------
commit 10bc3409459a525654d6b986b3cd49d22dd95460
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 20:15:44 2017 +0100
ec/ecp_nistz256.c: switch to faster addition chain in scalar inversion.
[and improve formatting]
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit ab4f2026b7ff8902d70ddd75adc080fc66ffd642
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:11:25 2017 +0100
ec/asm/ecp_nistz256-armv8.pl: add optimized inversion.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit d5e11843fe430dfa89bdf83b6f7805c709dcdb41
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:51:55 2017 +0100
ec/asm/ecp_nistz256-x86_64.pl: add .cfi and SEH handlers to new functions.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit eb7916960bf50f436593abe3d5f2e0592d291017
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:08:31 2017 +0100
ec/ecp_nistz256.c: improve ECDSA sign by 30-40%.
This is based on RT#3810, which added dedicated modular inversion.
ECDSA verify results improve as well, but not as much.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
-----------------------------------------------------------------------
Summary of changes:
crypto/ec/asm/ecp_nistz256-armv8.pl | 309 +++++++-
crypto/ec/asm/ecp_nistz256-x86_64.pl | 1391 ++++++++++++++++++++++++++++++----
crypto/ec/ec_err.c | 2 +
crypto/ec/ec_lcl.h | 7 +-
crypto/ec/ec_lib.c | 13 +-
crypto/ec/ecdsa_ossl.c | 60 +-
crypto/ec/ecp_nistz256.c | 192 ++++-
crypto/err/openssl.txt | 1 +
include/openssl/ecerr.h | 1 +
9 files changed, 1809 insertions(+), 167 deletions(-)
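
The common thread of these commits is constant-time inversion of the nonce modulo the P-256 group order n via Fermat's little theorem, k^-1 = k^(n-2) mod n, carried out by new dedicated Montgomery routines instead of the generic BN_mod_exp_mont_consttime path. As a plain-integer model of that relation (only the math, not the Montgomery-domain assembly; n is the .Lord constant added below, read as one little-endian number):

use strict;
use warnings;
use Math::BigInt;

# P-256 group order: the .Lord words below, read as one little-endian value
my $n = Math::BigInt->from_hex(
    "ffffffff00000000ffffffffffffffff" .
    "bce6faada7179e84f3b9cac2fc632551");

my $k   = Math::BigInt->new("1234567890123456789");   # any 0 < k < n
my $inv = $k->copy->bmodpow($n - 2, $n);               # Fermat: k^(n-2) mod n
my $chk = ($k * $inv) % $n;

print "k * k^-1 mod n = $chk\n";                       # expect 1
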
diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl
index d93c4fe..2f1eb21 100644
--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
@@ -22,11 +22,10 @@
# http://eprint.iacr.org/2013/816.
#
# with/without -DECP_NISTZ256_ASM
-# Apple A7 +120-360%
-# Cortex-A53 +120-400%
-# Cortex-A57 +120-350%
-# X-Gene +200-330%
-# Denver +140-400%
+# Apple A7 +190-360%
+# Cortex-A53 +190-400%
+# Cortex-A57 +190-350%
+# Denver +230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
@@ -109,6 +108,10 @@ $code.=<<___;
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
+.Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
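
.Lord above is the group order n as four little-endian 64-bit words, and .LordK is consumed by the new reduction code as the per-word Montgomery multiplier. Under the usual Montgomery convention, which the reduction steps rely on to cancel the low accumulator word, that requires LordK * n[0] == -1 (mod 2^64); the snippet below checks that relation rather than asserting it:

use strict;
use warnings;
use Math::BigInt;

my $n0    = Math::BigInt->from_hex("f3b9cac2fc632551");  # .Lord, lowest word
my $k0    = Math::BigInt->from_hex("ccd1c8aaee00bc4f");  # .LordK
my $two64 = Math::BigInt->new(2)->bpow(64);
my $prod  = ($n0 * $k0) % $two64;

print $prod->as_hex, "\n";   # expect 0xffffffffffffffff if .LordK = -1/n[0] mod 2^64
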
@@ -1309,6 +1312,302 @@ $code.=<<___;
ret
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldr $bi,[$bp] // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+
+ mul $acc0,$a0,$bi // a[0]*b[0]
+ umulh $t0,$a0,$bi
+
+ mul $acc1,$a1,$bi // a[1]*b[0]
+ umulh $t1,$a1,$bi
+
+ mul $acc2,$a2,$bi // a[2]*b[0]
+ umulh $t2,$a2,$bi
+
+ mul $acc3,$a3,$bi // a[3]*b[0]
+ umulh $acc4,$a3,$bi
+
+ mul $t4,$acc0,$ordk
+
+ adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adc $acc4,$acc4,xzr
+ mov $acc5,xzr
+___
+for ($i=1;$i<4;$i++) {
+ ################################################################
+ # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
+ # * abcdefgh
+ # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ #
+ # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+ # rewrite above as:
+ #
+ # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
+ # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
+$code.=<<___;
+ ldr $bi,[$bp,#8*$i] // b[i]
+
+ lsl $t0,$t4,#32
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ mul $t0,$a0,$bi
+ adc $t3,$t3,xzr
+ mul $t1,$a1,$bi
+
+ adds $acc0,$acc1,$t2
+ mul $t2,$a2,$bi
+ adcs $acc1,$acc2,$t3
+ mul $t3,$a3,$bi
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ adds $acc0,$acc0,$t0 // accumulate low parts
+ umulh $t0,$a0,$bi
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi
+ adc $acc4,$acc4,xzr
+ mul $t4,$acc0,$ordk
+ adds $acc1,$acc1,$t0 // accumulate high parts
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adcs $acc4,$acc4,$t3
+ adc $acc5,xzr,xzr
+___
+}
+$code.=<<___;
+ lsl $t0,$t4,#32 // last reduction
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// int rep);
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,%function
+.align 4
+ecp_nistz256_ord_sqr_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+ b .Loop_ord_sqr
+
+.align 4
+.Loop_ord_sqr:
+ sub $bp,$bp,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+ // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul $acc1,$a1,$a0 // a[1]*a[0]
+ umulh $t1,$a1,$a0
+ mul $acc2,$a2,$a0 // a[2]*a[0]
+ umulh $t2,$a2,$a0
+ mul $acc3,$a3,$a0 // a[3]*a[0]
+ umulh $acc4,$a3,$a0
+
+ adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
+ mul $t0,$a2,$a1 // a[2]*a[1]
+ umulh $t1,$a2,$a1
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a3,$a1 // a[3]*a[1]
+ umulh $t3,$a3,$a1
+ adc $acc4,$acc4,xzr // can't overflow
+
+ mul $acc5,$a3,$a2 // a[3]*a[2]
+ umulh $acc6,$a3,$a2
+
+ adds $t1,$t1,$t2 // accumulate high parts of multiplication
+ mul $acc0,$a0,$a0 // a[0]*a[0]
+ adc $t2,$t3,xzr // can't overflow
+
+ adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
+ umulh $a0,$a0,$a0
+ adcs $acc4,$acc4,$t1
+ mul $t1,$a1,$a1 // a[1]*a[1]
+ adcs $acc5,$acc5,$t2
+ umulh $a1,$a1,$a1
+ adc $acc6,$acc6,xzr // can't overflow
+
+ adds $acc1,$acc1,$acc1 // acc[1-6]*=2
+ mul $t2,$a2,$a2 // a[2]*a[2]
+ adcs $acc2,$acc2,$acc2
+ umulh $a2,$a2,$a2
+ adcs $acc3,$acc3,$acc3
+ mul $t3,$a3,$a3 // a[3]*a[3]
+ adcs $acc4,$acc4,$acc4
+ umulh $a3,$a3,$a3
+ adcs $acc5,$acc5,$acc5
+ adcs $acc6,$acc6,$acc6
+ adc $acc7,xzr,xzr
+
+ adds $acc1,$acc1,$a0 // +a[i]*a[i]
+ mul $t4,$acc0,$ordk
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$a1
+ adcs $acc4,$acc4,$t2
+ adcs $acc5,$acc5,$a2
+ adcs $acc6,$acc6,$t3
+ adc $acc7,$acc7,$a3
+___
+for($i=0; $i<4; $i++) { # reductions
+$code.=<<___;
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adc $acc3,xzr,$t4 // can't overflow
+___
+$code.=<<___ if ($i<3);
+ mul $t3,$acc0,$ordk
+___
+$code.=<<___;
+ lsl $t0,$t4,#32
+ subs $acc1,$acc1,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc2,$acc2,$t0
+ sbc $acc3,$acc3,$t1 // can't borrow
+___
+ ($t3,$t4) = ($t4,$t3);
+}
+$code.=<<___;
+ adds $acc0,$acc0,$acc4 // accumulate upper half
+ adcs $acc1,$acc1,$acc5
+ adcs $acc2,$acc2,$acc6
+ adcs $acc3,$acc3,$acc7
+ adc $acc4,xzr,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $a1,$acc1,$t1,lo
+ csel $a2,$acc2,$t2,lo
+ csel $a3,$acc3,$t3,lo
+
+ cbnz $bp,.Loop_ord_sqr
+
+ stp $a0,$a1,[$rp]
+ stp $a2,$a3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
} }
########################################################################
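
Worth noting about the two routines above: because the top words of the order are 0xffffffffffffffff and 0xffffffff00000000, the reduction never multiplies by them. It uses the identity spelled out in the ord_mul_mont comment, (2^k - 1)*x = 2^k*x - x, so those products become shifts and borrows (the lsl/lsr/subs/sbcs sequences). A numeric spot check of both identities, with an arbitrary 64-bit x:

use strict;
use warnings;
use Math::BigInt;

my $x  = Math::BigInt->from_hex("0123456789abcdef");  # arbitrary 64-bit value
my $n2 = Math::BigInt->from_hex("ffffffffffffffff");  # 2^64 - 1
my $n3 = Math::BigInt->from_hex("ffffffff00000000");  # 2^64 - 2^32

printf "n[2]*x == (x<<64) - x       : %s\n",
       ($n2 * $x == ($x << 64) - $x)         ? "ok" : "mismatch";
printf "n[3]*x == (x<<64) - (x<<32) : %s\n",
       ($n3 * $x == ($x << 64) - ($x << 32)) ? "ok" : "mismatch";
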
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 48d6464..96b5dba 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -1,15 +1,17 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
# Further optimization by <appro at openssl.org>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
-# Opteron +12-49% +110-150%
-# Bulldozer +14-45% +175-210%
-# P4 +18-46% n/a :-(
-# Westmere +12-34% +80-87%
-# Sandy Bridge +9-35% +110-120%
-# Ivy Bridge +9-35% +110-125%
-# Haswell +8-37% +140-160%
-# Broadwell +18-58% +145-210%
-# Atom +15-50% +130-180%
-# VIA Nano +43-160% +300-480%
+# Opteron +15-49% +150-195%
+# Bulldozer +18-45% +175-240%
+# P4 +24-46% +100-150%
+# Westmere +18-34% +87-160%
+# Sandy Bridge +14-35% +120-185%
+# Ivy Bridge +11-35% +125-180%
+# Haswell +10-37% +160-200%
+# Broadwell +24-58% +210-270%
+# Atom +20-50% +180-240%
+# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In the "this/original" column the lower coefficient is
+# for ECDSA sign, while in "with/without" it is for ECDH key agreement
+# and the higher one for ECDSA sign, the relatively fastest server-side
+# operation. Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
@@ -95,6 +99,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
___
{
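
The x86_64 ecp_nistz256_ord_mul_mont added in the next hunk follows the same pattern as the ARMv8 one: four rounds of "accumulate a[]*b[i], then one Montgomery reduction step derived from .LordK", finished by a single conditional subtraction of the order, giving a*b*2^-256 mod n. A word-level sketch of that structure (it models the algorithm, not the register scheduling of the assembly):

use strict;
use warnings;
use Math::BigInt;

my $n  = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff" .
                                "bce6faada7179e84f3b9cac2fc632551");  # .Lord
my $k0 = Math::BigInt->from_hex("ccd1c8aaee00bc4f");                  # .LordK
my $w  = Math::BigInt->new(2)->bpow(64);                              # word size

sub ord_mul_mont_model {                      # returns a*b*2^-256 mod n
    my ($a, $b) = @_;
    my $acc = Math::BigInt->new(0);
    for my $i (0 .. 3) {
        my $bi = ($b >> (64 * $i)) % $w;      # b[i]
        $acc += $a * $bi;                     # schoolbook: acc += a[] * b[i]
        my $m = (($acc % $w) * $k0) % $w;     # per-word reduction multiplier
        $acc  = ($acc + $m * $n) / $w;        # low word cancels, shift down
    }
    $acc -= $n if $acc >= $n;                 # single conditional subtraction
    return $acc;
}

my $a      = Math::BigInt->new(3)->bpow(100)->bmod($n);
my $b      = Math::BigInt->new(7)->bpow(100)->bmod($n);
my $expect = ($a * $b * Math::BigInt->new(2)->bpow(256)->bmodinv($n)) % $n;
my $got    = ord_mul_mont_model($a, $b);

print $got == $expect ? "model agrees\n" : "mismatch\n";
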
@@ -334,152 +344,1231 @@ ecp_nistz256_add:
mov $a1, $t1
adc \$0, $t4
- sub 8*0($a_ptr), $a0
- mov $a2, $t2
- sbb 8*1($a_ptr), $a1
- sbb 8*2($a_ptr), $a2
- mov $a3, $t3
- sbb 8*3($a_ptr), $a3
- sbb \$0, $t4
+ sub 8*0($a_ptr), $a0
+ mov $a2, $t2
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a3, $t3
+ sbb 8*3($a_ptr), $a3
+ sbb \$0, $t4
+
+ cmovc $t0, $a0
+ cmovc $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovc $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovc $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Ladd_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_add,.-ecp_nistz256_add
+
+################################################################################
+# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_sub
+.type ecp_nistz256_sub,\@function,3
+.align 32
+ecp_nistz256_sub:
+.cfi_startproc
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+.Lsub_body:
+
+ mov 8*0($a_ptr), $a0
+ xor $t4, $t4
+ mov 8*1($a_ptr), $a1
+ mov 8*2($a_ptr), $a2
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+
+ sub 8*0($b_ptr), $a0
+ sbb 8*1($b_ptr), $a1
+ mov $a0, $t0
+ sbb 8*2($b_ptr), $a2
+ sbb 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lsub_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+################################################################################
+# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_neg
+.type ecp_nistz256_neg,\@function,2
+.align 32
+ecp_nistz256_neg:
+.cfi_startproc
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+.Lneg_body:
+
+ xor $a0, $a0
+ xor $a1, $a1
+ xor $a2, $a2
+ xor $a3, $a3
+ xor $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a0, $t0
+ sbb 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lneg_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+___
+}
+{
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
+my ($poly1,$poly3)=($acc6,$acc7);
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_ord_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
+
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mul_body:
+
+ mov 8*0($b_org), %rax
+ mov $b_org, $b_ptr
+ lea .Lord(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# * b[0]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ mov %rax, $acc0
+ mov $t0, %rax
+ mov %rdx, $acc1
+
+ mulq 8*1($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq 8*2($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc0, $acc5
+ imulq %r15,$acc0
+
+ mov %rdx, $acc3
+ mulq 8*3($a_ptr)
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# First reduction step
+ mulq 8*0(%r14)
+ mov $acc0, $t1
+ add %rax, $acc5 # guaranteed to be zero
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $acc0 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $t1, %rax
+ adc %rdx, $acc2
+ mov $t1, %rdx
+ adc \$0, $acc0 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+
+ ################################# * b[1]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc1, $t0
+ imulq %r15, $acc1
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ xor $acc0, $acc0
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ################################# Second reduction step
+ mulq 8*0(%r14)
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc1, %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $acc1 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc %rdx, $acc3
+ mov $t1, %rdx
+ adc \$0, $acc1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc1, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+
+ ################################## * b[2]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc2, $t0
+ imulq %r15, $acc2
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ xor $acc1, $acc1
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ################################# Third reduction step
+ mulq 8*0(%r14)
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc2, %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc4
+ sbb \$0, $acc2 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc %rdx, $acc4
+ mov $t1, %rdx
+ adc \$0, $acc2 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc2, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+
+ ################################# * b[3]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc3, $t0
+ imulq %r15, $acc3
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc0
+ adc \$0, %rdx
+ xor $acc2, $acc2
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ################################# Last reduction step
+ mulq 8*0(%r14)
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc3, %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc5
+ sbb \$0, $acc3 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc %rdx, $acc5
+ mov $t1, %rdx
+ adc \$0, $acc3 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc3, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ################################# Subtract ord
+ mov $acc4, $a_ptr
+ sub 8*0(%r14), $acc4
+ mov $acc5, $acc3
+ sbb 8*1(%r14), $acc5
+ mov $acc0, $t0
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $a_ptr, $acc4
+ cmovc $acc3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mul_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# int rep);
+
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqr_body:
+
+ mov 8*0($a_ptr), $acc0
+ mov 8*1($a_ptr), %rax
+ mov 8*2($a_ptr), $acc6
+ mov 8*3($a_ptr), $acc7
+ lea .Lord(%rip), $a_ptr # pointer to modulus
+ mov $b_org, $b_ptr
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+ ################################# a[1:] * a[0]
+ mov %rax, $t1 # put aside a[1]
+ mul $acc0 # a[1] * a[0]
+ mov %rax, $acc1
+ movq $t1, %xmm1 # offload a[1]
+ mov $acc6, %rax
+ mov %rdx, $acc2
+
+ mul $acc0 # a[2] * a[0]
+ add %rax, $acc2
+ mov $acc7, %rax
+ movq $acc6, %xmm2 # offload a[2]
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mul $acc0 # a[3] * a[0]
+ add %rax, $acc3
+ mov $acc7, %rax
+ movq $acc7, %xmm3 # offload a[3]
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# a[3] * a[2]
+ mul $acc6 # a[3] * a[2]
+ mov %rax, $acc5
+ mov $acc6, %rax
+ mov %rdx, $acc6
+
+ ################################# a[2:] * a[1]
+ mul $t1 # a[2] * a[1]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc7
+
+ mul $t1 # a[3] * a[1]
+ add %rax, $acc4
+ adc \$0, %rdx
+
+ add $acc7, $acc4
+ adc %rdx, $acc5
+ adc \$0, $acc6 # can't overflow
+
+ ################################# *2
+ xor $acc7, $acc7
+ mov $acc0, %rax
+ add $acc1, $acc1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ ################################# Missing products
+ mul %rax # a[0] * a[0]
+ mov %rax, $acc0
+ movq %xmm1, %rax
+ mov %rdx, $t1
+
+ mul %rax # a[1] * a[1]
+ add $t1, $acc1
+ adc %rax, $acc2
+ movq %xmm2, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mul %rax # a[2] * a[2]
+ add $t1, $acc3
+ adc %rax, $acc4
+ movq %xmm3, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mov $acc0, $t0
+ imulq 8*4($a_ptr), $acc0 # *= .LordK
+
+ mul %rax # a[3] * a[3]
+ add $t1, $acc5
+ adc %rax, $acc6
+ mov 8*0($a_ptr), %rax # modulus[0]
+ adc %rdx, $acc7 # can't overflow
+
+ ################################# First reduction step
+ mul $acc0
+ mov $acc0, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax # modulus[1]
+ adc %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc0
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $acc0, %rax
+ adc %rdx, $acc2
+ mov $acc0, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc1, $t0
+ imulq 8*4($a_ptr), $acc1 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc0 # can't borrow
+
+ add $t1, $acc3
+ adc \$0, $acc0 # can't overflow
+
+ ################################# Second reduction step
+ mul $acc1
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc1
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $acc1, %rax
+ adc %rdx, $acc3
+ mov $acc1, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc2, $t0
+ imulq 8*4($a_ptr), $acc2 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc1 # can't borrow
+
+ add $t1, $acc0
+ adc \$0, $acc1 # can't overflow
+
+ ################################# Third reduction step
+ mul $acc2
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc0
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc2
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ mov $acc2, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc3, $t0
+ imulq 8*4($a_ptr), $acc3 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc1
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc2 # can't borrow
+
+ add $t1, $acc1
+ adc \$0, $acc2 # can't overflow
+
+ ################################# Last reduction step
+ mul $acc3
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc1
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc3
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ mov $acc3, %rdx
+ adc \$0, $t1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc2
+ sbb %rdx, $acc3 # can't borrow
+
+ add $t1, $acc2
+ adc \$0, $acc3 # can't overflow
+
+ ################################# Add bits [511:256] of the sqr result
+ xor %rdx, %rdx
+ add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc0, $acc4
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ################################# Compare to modulus
+ sub 8*0($a_ptr), $acc0
+ mov $acc2, $acc6
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc7
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rdx
+
+ cmovc $acc4, $acc0
+ cmovnc $acc1, %rax
+ cmovnc $acc2, $acc6
+ cmovnc $acc3, $acc7
+
+ dec $b_ptr
+ jnz .Loop_ord_sqr
+
+ mov $acc0, 8*0($r_ptr)
+ mov %rax, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc6, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc7, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqr_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
+$code.=<<___ if ($addx);
+################################################################################
+.type ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mulx_body:
+
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+ lea .Lord-128(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mulx $acc3, $t1, $acc3
+ add $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ mulx %r15, %rdx, %rax
+ adc $t1, $acc2
+ adc $t0, $acc3
+ adc \$0, $acc4
+
+ ################################# reduction
+ xor $acc5, $acc5 # $acc5=0, cf=0, of=0
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adcx $t0, $acc3
+ adox $t1, $acc4
+ adcx $acc0, $acc4
+ adox $acc0, $acc5
+ adc \$0, $acc5 # cf=0, of=0
+
+ ################################# Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc1 # guaranteed to be zero
+ adox $t1, $acc2
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adcx $acc1, $acc5
+ adox $acc1, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adcx $t0, $acc5
+ adox $t1, $acc0
+ adcx $acc2, $acc0
+ adox $acc2, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc0
+ adox $t1, $acc1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc3 # guaranteed to be zero
+ adox $t1, $acc4
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128(%r14), $t0, $t1
+ lea 128(%r14),%r14
+ mov $acc4, $t2
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mov $acc5, $t3
+ adcx $acc3, $acc1
+ adox $acc3, $acc2
+ adc \$0, $acc2
+
+ #################################
+ # Branch-less conditional subtraction of ord
+ mov $acc0, $t0
+ sub 8*0(%r14), $acc4
+ sbb 8*1(%r14), $acc5
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
- cmovc $t0, $a0
- cmovc $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovc $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovc $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
- mov 0(%rsp),%r13
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
.cfi_restore %r13
- mov 8(%rsp),%r12
+ mov 24(%rsp),%r12
.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Ladd_epilogue:
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
ret
.cfi_endproc
-.size ecp_nistz256_add,.-ecp_nistz256_add
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
-################################################################################
-# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
-.globl ecp_nistz256_sub
-.type ecp_nistz256_sub,\@function,3
+.type ecp_nistz256_ord_sqr_montx,\@function,3
.align 32
-ecp_nistz256_sub:
+ecp_nistz256_ord_sqr_montx:
.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
-.Lsub_body:
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqrx_body:
- mov 8*0($a_ptr), $a0
- xor $t4, $t4
- mov 8*1($a_ptr), $a1
- mov 8*2($a_ptr), $a2
- mov 8*3($a_ptr), $a3
- lea .Lpoly(%rip), $a_ptr
+ mov $b_org, $b_ptr
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea .Lord(%rip), $a_ptr
+ jmp .Loop_ord_sqrx
- sub 8*0($b_ptr), $a0
- sbb 8*1($b_ptr), $a1
- mov $a0, $t0
- sbb 8*2($b_ptr), $a2
- sbb 8*3($b_ptr), $a3
- mov $a1, $t1
- sbb \$0, $t4
+.align 32
+.Loop_ord_sqrx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ mov %rdx, %rax # offload a[0]
+ movq $acc6, %xmm1 # offload a[1]
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ add $t0, $acc2
+ movq $acc7, %xmm2 # offload a[2]
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
- add 8*0($a_ptr), $a0
- mov $a2, $t2
- adc 8*1($a_ptr), $a1
- adc 8*2($a_ptr), $a2
- mov $a3, $t3
- adc 8*3($a_ptr), $a3
- test $t4, $t4
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov %rax, %rdx
+ movq $acc0, %xmm3 # offload a[3]
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
- cmovz $t0, $a0
- cmovz $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ ################################# a[i]*a[i]
+ mulx %rdx, $acc0, $t1
+ movq %xmm1, %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ movq %xmm2, %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ mulx %rdx, $t0, $t1
+ .byte 0x67
+ movq %xmm3, %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ adox $t1, $acc5
+ mulx %rdx, $t0, $t4
+ adox $t0, $acc6
+ adox $t4, $acc7
- mov 0(%rsp),%r13
-.cfi_restore %r13
- mov 8(%rsp),%r12
-.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Lsub_epilogue:
- ret
-.cfi_endproc
-.size ecp_nistz256_sub,.-ecp_nistz256_sub
+ ################################# reduction
+ mov $acc0, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
-################################################################################
-# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
-.globl ecp_nistz256_neg
-.type ecp_nistz256_neg,\@function,2
-.align 32
-ecp_nistz256_neg:
-.cfi_startproc
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
-.Lneg_body:
+ xor %rax, %rax # cf=0, of=0
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0 # of=0
+ adcx %rax, $acc0 # cf=0
- xor $a0, $a0
- xor $a1, $a1
- xor $a2, $a2
- xor $a3, $a3
- xor $t4, $t4
+ #################################
+ mov $acc1, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
- sub 8*0($a_ptr), $a0
- sbb 8*1($a_ptr), $a1
- sbb 8*2($a_ptr), $a2
- mov $a0, $t0
- sbb 8*3($a_ptr), $a3
- lea .Lpoly(%rip), $a_ptr
- mov $a1, $t1
- sbb \$0, $t4
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc1 # guaranteed to be zero
+ adcx $t1, $acc2
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc3
+ adcx $t1, $acc0
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1 # cf=0
+ adox %rax, $acc1 # of=0
- add 8*0($a_ptr), $a0
- mov $a2, $t2
- adc 8*1($a_ptr), $a1
- adc 8*2($a_ptr), $a2
- mov $a3, $t3
- adc 8*3($a_ptr), $a3
- test $t4, $t4
+ #################################
+ mov $acc2, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
- cmovz $t0, $a0
- cmovz $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2 # of=0
+ adcx %rax, $acc2 # cf=0
- mov 0(%rsp),%r13
+ #################################
+ mov $acc3, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc3 # guaranteed to be zero
+ adcx $t1, $acc0
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc1
+ adcx $t1, $acc2
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ adox %rax, $acc3
+
+ ################################# accumulate upper half
+ add $acc0, $acc4 # add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc4, %rdx
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, $acc6
+ adc \$0, %rax
+
+ ################################# compare to modulus
+ sub 8*0($a_ptr), $acc4
+ mov $acc2, $acc7
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc0
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rax
+
+ cmovnc $acc4, %rdx
+ cmovnc $acc1, $acc6
+ cmovnc $acc2, $acc7
+ cmovnc $acc3, $acc0
+
+ dec $b_ptr
+ jnz .Loop_ord_sqrx
+
+ mov %rdx, 8*0($r_ptr)
+ mov $acc6, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc7, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc0, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
.cfi_restore %r13
- mov 8(%rsp),%r12
+ mov 24(%rsp),%r12
.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Lneg_epilogue:
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
ret
.cfi_endproc
-.size ecp_nistz256_neg,.-ecp_nistz256_neg
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___
-}
-{
-my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
-my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
-my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
-my ($poly1,$poly3)=($acc6,$acc7);
$code.=<<___;
################################################################################
@@ -3366,6 +4455,24 @@ full_handler:
.rva .LSEH_end_ecp_nistz256_neg
.rva .LSEH_info_ecp_nistz256_neg
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_end_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_info_ecp_nistz256_ord_mul_mont
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_end_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_info_ecp_nistz256_ord_mul_montx
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
.rva .LSEH_begin_ecp_nistz256_to_mont
.rva .LSEH_end_ecp_nistz256_to_mont
.rva .LSEH_info_ecp_nistz256_to_mont
@@ -3453,6 +4560,30 @@ $code.=<<___;
.byte 9,0,0,0
.rva short_handler
.rva .Lneg_body,.Lneg_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_ord_mul_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___ if ($addx);
+.LSEH_info_ecp_nistz256_ord_mul_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___;
.LSEH_info_ecp_nistz256_to_mont:
.byte 9,0,0,0
.rva full_handler
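
The _montx variants above are only reached when the 0x80100 test against OPENSSL_ia32cap_P+8 passes, i.e. when both feature bits behind that mask are set. Going by the usual OpenSSL capability-vector layout (not something this patch states), that word holds CPUID.(7,0):EBX, where bit 8 is BMI2 (mulx) and bit 19 is ADX (adcx/adox). Decoding the mask:

use strict;
use warnings;

my $mask = 0x80100;                                  # the value tested above
my @bits = grep { ($mask >> $_) & 1 } 0 .. 31;
print "0x80100 sets bits @bits\n";                   # bits 8 and 19
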
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 9f82b4e..efec5a7 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -48,6 +48,8 @@ static const ERR_STRING_DATA EC_str_functs[] = {
"ECPKParameters_print_fp"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_GET_AFFINE, 0),
"ecp_nistz256_get_affine"},
+ {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_INV_MOD_ORD, 0),
+ "ecp_nistz256_inv_mod_ord"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 0),
"ecp_nistz256_mult_precompute"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_POINTS_MUL, 0),
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 6cc0190..540aa53 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -155,6 +155,9 @@ struct ec_method_st {
/* custom ECDH operation */
int (*ecdh_compute_key)(unsigned char **pout, size_t *poutlen,
const EC_POINT *pub_key, const EC_KEY *ecdh);
+ /* Inverse modulo order */
+ int (*field_inverse_mod_ord)(const EC_GROUP *, BIGNUM *r, BIGNUM *x,
+ BN_CTX *ctx);
};
/*
@@ -520,7 +523,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
unsigned char *digit, unsigned char in);
#endif
-int ec_precompute_mont_data(EC_GROUP *);
int ec_group_simple_order_bits(const EC_GROUP *group);
#ifdef ECP_NISTZ256_ASM
@@ -604,3 +606,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
const uint8_t peer_public_value[32]);
void X25519_public_from_private(uint8_t out_public_value[32],
const uint8_t private_key[32]);
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx);
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index 7ae48cf..8d508dd 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -261,6 +261,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
return meth->field_type;
}
+static int ec_precompute_mont_data(EC_GROUP *);
+
int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
const BIGNUM *order, const BIGNUM *cofactor)
{
@@ -961,7 +963,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
* ec_precompute_mont_data sets |group->mont_data| from |group->order| and
* returns one on success. On error it returns zero.
*/
-int ec_precompute_mont_data(EC_GROUP *group)
+static int ec_precompute_mont_data(EC_GROUP *group)
{
BN_CTX *ctx = BN_CTX_new();
int ret = 0;
@@ -1006,3 +1008,12 @@ int ec_group_simple_order_bits(const EC_GROUP *group)
return 0;
return BN_num_bits(group->order);
}
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ if (group->meth->field_inverse_mod_ord != NULL)
+ return group->meth->field_inverse_mod_ord(group, res, x, ctx);
+ else
+ return 0;
+}
diff --git a/crypto/ec/ecdsa_ossl.c b/crypto/ec/ecdsa_ossl.c
index 30458f1..a405d38 100644
--- a/crypto/ec/ecdsa_ossl.c
+++ b/crypto/ec/ecdsa_ossl.c
@@ -153,30 +153,33 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
}
while (BN_is_zero(r));
- /* compute the inverse of k */
- if (EC_GROUP_get_mont_data(group) != NULL) {
- /*
- * We want inverse in constant time, therefore we utilize the fact
- * order must be prime and use Fermat's Little Theorem instead.
- */
- if (!BN_set_word(X, 2)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- if (!BN_mod_sub(X, order, X, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- BN_set_flags(X, BN_FLG_CONSTTIME);
- if (!BN_mod_exp_mont_consttime
- (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- } else {
- if (!BN_mod_inverse(k, k, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
+ /* compute the inverse of k */
+ if (group->mont_data != NULL) {
+ /*
+ * We want inverse in constant time, therefore we utilize the fact
+ * order must be prime and use Fermat's Little Theorem instead.
+ */
+ if (!BN_set_word(X, 2)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ if (!BN_mod_sub(X, order, X, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ BN_set_flags(X, BN_FLG_CONSTTIME);
+ if (!BN_mod_exp_mont_consttime(k, k, X, order, ctx,
+ group->mont_data)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ } else {
+ if (!BN_mod_inverse(k, k, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
}
}
@@ -407,9 +410,12 @@ int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
goto err;
}
/* calculate tmp1 = inv(S) mod order */
- if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
- ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
+ if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
+ ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
+ goto err;
+ }
}
/* digest -> m */
i = BN_num_bits(order);
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index 3863a61..08a7e84 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -1,15 +1,17 @@
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+ * Copyright (c) 2015, CloudFlare, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
- * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
+ * (3) CloudFlare, Inc.
*
* Reference:
* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -908,7 +910,7 @@ __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
*/
#if defined(ECP_NISTZ256_AVX2)
# if !(defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined(_MX64)) || \
+ defined(_M_AMD64) || defined(_M_X64)) || \
!(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
# undef ECP_NISTZ256_AVX2
# else
@@ -1495,6 +1497,189 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
return HAVEPRECOMP(group, nistz256);
}
+#if defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__powerpc64__) || defined(_ARCH_PPC64) || \
+ defined(__aarch64__)
+/*
+ * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
+ */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ int rep);
+
+static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ /* RR = 2^512 mod ord(p256) */
+ static const BN_ULONG RR[P256_LIMBS] = {
+ TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6),
+ TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620)
+ };
+ /* The constant 1 (unlike ONE that is one in Montgomery representation) */
+ static const BN_ULONG one[P256_LIMBS] = {
+ TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0)
+ };
+ /*
+ * We don't use entry 0 in the table, so we omit it and address
+ * with -1 offset.
+ */
+ BN_ULONG table[15][P256_LIMBS];
+ BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
+ int i, ret = 0;
+
+ /*
+ * Catch allocation failure early.
+ */
+ if (bn_wexpand(r, P256_LIMBS) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
+ BIGNUM *tmp;
+
+ if ((tmp = BN_CTX_get(ctx)) == NULL
+ || !BN_nnmod(tmp, x, group->order, ctx)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+ x = tmp;
+ }
+
+ if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ ecp_nistz256_ord_mul_mont(table[0], t, RR);
+#if 0
+ /*
+ * Original sparse-then-fixed-window algorithm, retained for reference.
+ */
+ for (i = 2; i < 16; i += 2) {
+ ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
+ ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
+ }
+
+ /*
+ * The top 128 bits of the exponent are highly redundant, so we
+ * perform an optimized flow
+ */
+ ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */
+ ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */
+ ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */
+
+ /*
+ * The bottom 128 bits of the exponent are processed with a fixed 4-bit window
+ */
+ for(i = 0; i < 32; i++) {
+ /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2),
+ * split into nibbles */
+ static const unsigned char expLo[32] = {
+ 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
+ 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf
+ };
+
+ ecp_nistz256_ord_sqr_mont(out, out, 4);
+ /* The exponent is public, no need for constant-time access */
+ ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
+ }
+#else
+ /*
+ * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+ *
+ * Even though this code path spares 12 squarings (4.5%) and 13
+ * multiplications (25%), the sign operation as a whole is not that
+ * much faster, not more than 2%...
+ */
+ enum {
+ i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111,
+ i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32
+ };
+
+ /* pre-calculate powers */
+ ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
+
+ ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
+
+ ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
+ ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
+
+ ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
+ ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
+ ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
+ ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
+
+ /* calculations */
+ ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64);
+ ecp_nistz256_ord_mul_mont(out, out, table[i_x32]);
+
+ for (i = 0; i < 27; i++) {
+ static const struct { unsigned char p, i; } chain[27] = {
+ { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 },
+ { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 },
+ { 4, i_101 }, { 3, i_101 }, { 3, i_101 },
+ { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 },
+ { 2, i_1 }, { 5, i_1 }, { 6, i_1111 },
+ { 5, i_111 }, { 4, i_111 }, { 5, i_111 },
+ { 5, i_101 }, { 3, i_11 }, { 10, i_101111 },
+ { 2, i_11 }, { 5, i_11 }, { 5, i_11 },
+ { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 }
+ };
+
+ ecp_nistz256_ord_sqr_mont(out, out, chain[i].p);
+ ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]);
+ }
+#endif
+ ecp_nistz256_ord_mul_mont(out, out, one);
+
+ /*
+ * Can't fail, but check return code to be consistent anyway.
+ */
+ if (!bn_set_words(r, out, P256_LIMBS))
+ goto err;
+
+ ret = 1;
+err:
+ return ret;
+}
+#else
+# define ecp_nistz256_inv_mod_ord NULL
+#endif
+
const EC_METHOD *EC_GFp_nistz256_method(void)
{
static const EC_METHOD ret = {
@@ -1544,7 +1729,8 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
ec_key_simple_generate_public_key,
0, /* keycopy */
0, /* keyfinish */
- ecdh_simple_compute_key
+ ecdh_simple_compute_key,
+ ecp_nistz256_inv_mod_ord /* can be #define-d NULL */
};
return &ret;
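
The #else branch above is the faster addition chain from the first commit (credited to the briansmith.org write-up). It can be checked independently of the Montgomery code by replaying it at the exponent level: a squaring by p multiplies the running exponent by 2^p, a multiplication by a table entry adds that entry's exponent, and the result should come out to n - 2, the Fermat exponent. The script below performs that comparison rather than asserting it:

use strict;
use warnings;
use Math::BigInt;

my $n = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff" .
                               "bce6faada7179e84f3b9cac2fc632551");

# exponents represented by the precomputed table entries
my %e = (i_1 => 1, i_10 => 2, i_11 => 3, i_101 => 5, i_111 => 7,
         i_1010 => 10, i_1111 => 15, i_10101 => 21, i_101010 => 42,
         i_101111 => 47, i_x6 => 63, i_x8 => 255, i_x16 => 65535,
         i_x32 => Math::BigInt->new(2)->bpow(32) - 1);

# out = sqr(x32, 64); out *= x32  =>  exponent (2^32-1)*2^64 + (2^32-1)
my $out = $e{i_x32} * Math::BigInt->new(2)->bpow(64) + $e{i_x32};

# the 27 (squarings, table entry) steps of chain[]
my @chain = ([32,'i_x32'],   [6,'i_101111'], [5,'i_111'],   [4,'i_11'],
             [5,'i_1111'],   [5,'i_10101'],  [4,'i_101'],   [3,'i_101'],
             [3,'i_101'],    [5,'i_111'],    [9,'i_101111'],[6,'i_1111'],
             [2,'i_1'],      [5,'i_1'],      [6,'i_1111'],  [5,'i_111'],
             [4,'i_111'],    [5,'i_111'],    [5,'i_101'],   [3,'i_11'],
             [10,'i_101111'],[2,'i_11'],     [5,'i_11'],    [5,'i_11'],
             [3,'i_1'],      [7,'i_10101'],  [6,'i_1111']);

for my $step (@chain) {
    $out = ($out << $step->[0]) + $e{ $step->[1] };
}

print $out == $n - 2 ? "chain reproduces n-2\n" : "mismatch\n";
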
diff --git a/crypto/err/openssl.txt b/crypto/err/openssl.txt
index 9ec0009..6449662 100644
--- a/crypto/err/openssl.txt
+++ b/crypto/err/openssl.txt
@@ -458,6 +458,7 @@ EC_F_ECPARAMETERS_PRINT_FP:148:ECParameters_print_fp
EC_F_ECPKPARAMETERS_PRINT:149:ECPKParameters_print
EC_F_ECPKPARAMETERS_PRINT_FP:150:ECPKParameters_print_fp
EC_F_ECP_NISTZ256_GET_AFFINE:240:ecp_nistz256_get_affine
+EC_F_ECP_NISTZ256_INV_MOD_ORD:275:ecp_nistz256_inv_mod_ord
EC_F_ECP_NISTZ256_MULT_PRECOMPUTE:243:ecp_nistz256_mult_precompute
EC_F_ECP_NISTZ256_POINTS_MUL:241:ecp_nistz256_points_mul
EC_F_ECP_NISTZ256_PRE_COMP_NEW:244:ecp_nistz256_pre_comp_new
diff --git a/include/openssl/ecerr.h b/include/openssl/ecerr.h
index bd09cb7..a1b9ea1 100644
--- a/include/openssl/ecerr.h
+++ b/include/openssl/ecerr.h
@@ -50,6 +50,7 @@ int ERR_load_EC_strings(void);
# define EC_F_ECPKPARAMETERS_PRINT 149
# define EC_F_ECPKPARAMETERS_PRINT_FP 150
# define EC_F_ECP_NISTZ256_GET_AFFINE 240
+# define EC_F_ECP_NISTZ256_INV_MOD_ORD 275
# define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE 243
# define EC_F_ECP_NISTZ256_POINTS_MUL 241
# define EC_F_ECP_NISTZ256_PRE_COMP_NEW 244