[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Sun Jan 7 20:33:34 UTC 2018
The branch master has been updated
via 10bc3409459a525654d6b986b3cd49d22dd95460 (commit)
via ab4f2026b7ff8902d70ddd75adc080fc66ffd642 (commit)
via d5e11843fe430dfa89bdf83b6f7805c709dcdb41 (commit)
via eb7916960bf50f436593abe3d5f2e0592d291017 (commit)
from 617b49db14fa4c1211bfc5d0e88294d0f159c9a9 (commit)
- Log -----------------------------------------------------------------
commit 10bc3409459a525654d6b986b3cd49d22dd95460
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 20:15:44 2017 +0100
ec/ecp_nistz256.c: switch to faster addition chain in scalar inversion.
[and improve formatting]
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit ab4f2026b7ff8902d70ddd75adc080fc66ffd642
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:11:25 2017 +0100
ec/asm/ecp_nistz256-armv8.pl: add optimized inversion.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit d5e11843fe430dfa89bdf83b6f7805c709dcdb41
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:51:55 2017 +0100
ec/asm/ecp_nistz256-x86_64.pl: add .cfi and SEH handlers to new functions.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
commit eb7916960bf50f436593abe3d5f2e0592d291017
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Dec 30 15:08:31 2017 +0100
ec/ecp_nistz256.c: improve ECDSA sign by 30-40%.
This is based on RT#3810, which added dedicated modular inversion.
ECDSA verify results improve as well, but not as much.
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
-----------------------------------------------------------------------
Summary of changes:
crypto/ec/asm/ecp_nistz256-armv8.pl | 309 +++++++-
crypto/ec/asm/ecp_nistz256-x86_64.pl | 1391 ++++++++++++++++++++++++++++++----
crypto/ec/ec_err.c | 2 +
crypto/ec/ec_lcl.h | 7 +-
crypto/ec/ec_lib.c | 13 +-
crypto/ec/ecdsa_ossl.c | 60 +-
crypto/ec/ecp_nistz256.c | 192 ++++-
crypto/err/openssl.txt | 1 +
include/openssl/ecerr.h | 1 +
9 files changed, 1809 insertions(+), 167 deletions(-)
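
The common thread of these commits is constant-time inversion of the nonce modulo the P-256 group order n via Fermat's little theorem, k^-1 = k^(n-2) mod n, carried out by new dedicated Montgomery routines instead of the generic BN_mod_exp_mont_consttime path. As a plain-integer model of that relation (only the math, not the Montgomery-domain assembly; n is the .Lord constant added below, read as one little-endian number):

use strict;
use warnings;
use Math::BigInt;

# P-256 group order: the .Lord words below, read as one little-endian value
my $n = Math::BigInt->from_hex(
    "ffffffff00000000ffffffffffffffff" .
    "bce6faada7179e84f3b9cac2fc632551");

my $k   = Math::BigInt->new("1234567890123456789");   # any 0 < k < n
my $inv = $k->copy->bmodpow($n - 2, $n);               # Fermat: k^(n-2) mod n
my $chk = ($k * $inv) % $n;

print "k * k^-1 mod n = $chk\n";                       # expect 1
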
diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl
index d93c4fe..2f1eb21 100644
--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
@@ -22,11 +22,10 @@
# http://eprint.iacr.org/2013/816.
#
# with/without -DECP_NISTZ256_ASM
-# Apple A7 +120-360%
-# Cortex-A53 +120-400%
-# Cortex-A57 +120-350%
-# X-Gene +200-330%
-# Denver +140-400%
+# Apple A7 +190-360%
+# Cortex-A53 +190-400%
+# Cortex-A57 +190-350%
+# Denver +230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
@@ -109,6 +108,10 @@ $code.=<<___;
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
+.Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
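
.Lord above is the group order n as four little-endian 64-bit words, and .LordK is consumed by the new reduction code as the per-word Montgomery multiplier. Under the usual Montgomery convention, which the reduction steps rely on to cancel the low accumulator word, that requires LordK * n[0] == -1 (mod 2^64); the snippet below checks that relation rather than asserting it:

use strict;
use warnings;
use Math::BigInt;

my $n0    = Math::BigInt->from_hex("f3b9cac2fc632551");  # .Lord, lowest word
my $k0    = Math::BigInt->from_hex("ccd1c8aaee00bc4f");  # .LordK
my $two64 = Math::BigInt->new(2)->bpow(64);
my $prod  = ($n0 * $k0) % $two64;

print $prod->as_hex, "\n";   # expect 0xffffffffffffffff if .LordK = -1/n[0] mod 2^64
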
@@ -1309,6 +1312,302 @@ $code.=<<___;
ret
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldr $bi,[$bp] // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+
+ mul $acc0,$a0,$bi // a[0]*b[0]
+ umulh $t0,$a0,$bi
+
+ mul $acc1,$a1,$bi // a[1]*b[0]
+ umulh $t1,$a1,$bi
+
+ mul $acc2,$a2,$bi // a[2]*b[0]
+ umulh $t2,$a2,$bi
+
+ mul $acc3,$a3,$bi // a[3]*b[0]
+ umulh $acc4,$a3,$bi
+
+ mul $t4,$acc0,$ordk
+
+ adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adc $acc4,$acc4,xzr
+ mov $acc5,xzr
+___
+for ($i=1;$i<4;$i++) {
+ ################################################################
+ # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
+ # * abcdefgh
+ # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ #
+ # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+ # rewrite above as:
+ #
+ # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
+ # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
+$code.=<<___;
+ ldr $bi,[$bp,#8*$i] // b[i]
+
+ lsl $t0,$t4,#32
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ mul $t0,$a0,$bi
+ adc $t3,$t3,xzr
+ mul $t1,$a1,$bi
+
+ adds $acc0,$acc1,$t2
+ mul $t2,$a2,$bi
+ adcs $acc1,$acc2,$t3
+ mul $t3,$a3,$bi
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ adds $acc0,$acc0,$t0 // accumulate low parts
+ umulh $t0,$a0,$bi
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi
+ adc $acc4,$acc4,xzr
+ mul $t4,$acc0,$ordk
+ adds $acc1,$acc1,$t0 // accumulate high parts
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adcs $acc4,$acc4,$t3
+ adc $acc5,xzr,xzr
+___
+}
+$code.=<<___;
+ lsl $t0,$t4,#32 // last reduction
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// int rep);
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,%function
+.align 4
+ecp_nistz256_ord_sqr_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+ b .Loop_ord_sqr
+
+.align 4
+.Loop_ord_sqr:
+ sub $bp,$bp,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+ // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul $acc1,$a1,$a0 // a[1]*a[0]
+ umulh $t1,$a1,$a0
+ mul $acc2,$a2,$a0 // a[2]*a[0]
+ umulh $t2,$a2,$a0
+ mul $acc3,$a3,$a0 // a[3]*a[0]
+ umulh $acc4,$a3,$a0
+
+ adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
+ mul $t0,$a2,$a1 // a[2]*a[1]
+ umulh $t1,$a2,$a1
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a3,$a1 // a[3]*a[1]
+ umulh $t3,$a3,$a1
+ adc $acc4,$acc4,xzr // can't overflow
+
+ mul $acc5,$a3,$a2 // a[3]*a[2]
+ umulh $acc6,$a3,$a2
+
+ adds $t1,$t1,$t2 // accumulate high parts of multiplication
+ mul $acc0,$a0,$a0 // a[0]*a[0]
+ adc $t2,$t3,xzr // can't overflow
+
+ adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
+ umulh $a0,$a0,$a0
+ adcs $acc4,$acc4,$t1
+ mul $t1,$a1,$a1 // a[1]*a[1]
+ adcs $acc5,$acc5,$t2
+ umulh $a1,$a1,$a1
+ adc $acc6,$acc6,xzr // can't overflow
+
+ adds $acc1,$acc1,$acc1 // acc[1-6]*=2
+ mul $t2,$a2,$a2 // a[2]*a[2]
+ adcs $acc2,$acc2,$acc2
+ umulh $a2,$a2,$a2
+ adcs $acc3,$acc3,$acc3
+ mul $t3,$a3,$a3 // a[3]*a[3]
+ adcs $acc4,$acc4,$acc4
+ umulh $a3,$a3,$a3
+ adcs $acc5,$acc5,$acc5
+ adcs $acc6,$acc6,$acc6
+ adc $acc7,xzr,xzr
+
+ adds $acc1,$acc1,$a0 // +a[i]*a[i]
+ mul $t4,$acc0,$ordk
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$a1
+ adcs $acc4,$acc4,$t2
+ adcs $acc5,$acc5,$a2
+ adcs $acc6,$acc6,$t3
+ adc $acc7,$acc7,$a3
+___
+for($i=0; $i<4; $i++) { # reductions
+$code.=<<___;
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adc $acc3,xzr,$t4 // can't overflow
+___
+$code.=<<___ if ($i<3);
+ mul $t3,$acc0,$ordk
+___
+$code.=<<___;
+ lsl $t0,$t4,#32
+ subs $acc1,$acc1,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc2,$acc2,$t0
+ sbc $acc3,$acc3,$t1 // can't borrow
+___
+ ($t3,$t4) = ($t4,$t3);
+}
+$code.=<<___;
+ adds $acc0,$acc0,$acc4 // accumulate upper half
+ adcs $acc1,$acc1,$acc5
+ adcs $acc2,$acc2,$acc6
+ adcs $acc3,$acc3,$acc7
+ adc $acc4,xzr,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $a1,$acc1,$t1,lo
+ csel $a2,$acc2,$t2,lo
+ csel $a3,$acc3,$t3,lo
+
+ cbnz $bp,.Loop_ord_sqr
+
+ stp $a0,$a1,[$rp]
+ stp $a2,$a3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
} }
########################################################################
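
Worth noting about the two routines above: because the top words of the order are 0xffffffffffffffff and 0xffffffff00000000, the reduction never multiplies by them. It uses the identity spelled out in the ord_mul_mont comment, (2^k - 1)*x = 2^k*x - x, so those products become shifts and borrows (the lsl/lsr/subs/sbcs sequences). A numeric spot check of both identities, with an arbitrary 64-bit x:

use strict;
use warnings;
use Math::BigInt;

my $x  = Math::BigInt->from_hex("0123456789abcdef");  # arbitrary 64-bit value
my $n2 = Math::BigInt->from_hex("ffffffffffffffff");  # 2^64 - 1
my $n3 = Math::BigInt->from_hex("ffffffff00000000");  # 2^64 - 2^32

printf "n[2]*x == (x<<64) - x       : %s\n",
       ($n2 * $x == ($x << 64) - $x)         ? "ok" : "mismatch";
printf "n[3]*x == (x<<64) - (x<<32) : %s\n",
       ($n3 * $x == ($x << 64) - ($x << 32)) ? "ok" : "mismatch";
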
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 48d6464..96b5dba 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -1,15 +1,17 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
# Further optimization by <appro at openssl.org>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
-# Opteron +12-49% +110-150%
-# Bulldozer +14-45% +175-210%
-# P4 +18-46% n/a :-(
-# Westmere +12-34% +80-87%
-# Sandy Bridge +9-35% +110-120%
-# Ivy Bridge +9-35% +110-125%
-# Haswell +8-37% +140-160%
-# Broadwell +18-58% +145-210%
-# Atom +15-50% +130-180%
-# VIA Nano +43-160% +300-480%
+# Opteron +15-49% +150-195%
+# Bulldozer +18-45% +175-240%
+# P4 +24-46% +100-150%
+# Westmere +18-34% +87-160%
+# Sandy Bridge +14-35% +120-185%
+# Ivy Bridge +11-35% +125-180%
+# Haswell +10-37% +160-200%
+# Broadwell +24-58% +210-270%
+# Atom +20-50% +180-240%
+# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In the "this/original" column the lower coefficient is
+# for ECDSA sign, while in "with/without" it is for ECDH key agreement
+# and the higher one for ECDSA sign, the relatively fastest server-side
+# operation. Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
@@ -95,6 +99,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
___
{
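
The x86_64 ecp_nistz256_ord_mul_mont added in the next hunk follows the same pattern as the ARMv8 one: four rounds of "accumulate a[]*b[i], then one Montgomery reduction step derived from .LordK", finished by a single conditional subtraction of the order, giving a*b*2^-256 mod n. A word-level sketch of that structure (it models the algorithm, not the register scheduling of the assembly):

use strict;
use warnings;
use Math::BigInt;

my $n  = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff" .
                                "bce6faada7179e84f3b9cac2fc632551");  # .Lord
my $k0 = Math::BigInt->from_hex("ccd1c8aaee00bc4f");                  # .LordK
my $w  = Math::BigInt->new(2)->bpow(64);                              # word size

sub ord_mul_mont_model {                      # returns a*b*2^-256 mod n
    my ($a, $b) = @_;
    my $acc = Math::BigInt->new(0);
    for my $i (0 .. 3) {
        my $bi = ($b >> (64 * $i)) % $w;      # b[i]
        $acc += $a * $bi;                     # schoolbook: acc += a[] * b[i]
        my $m = (($acc % $w) * $k0) % $w;     # per-word reduction multiplier
        $acc  = ($acc + $m * $n) / $w;        # low word cancels, shift down
    }
    $acc -= $n if $acc >= $n;                 # single conditional subtraction
    return $acc;
}

my $a      = Math::BigInt->new(3)->bpow(100)->bmod($n);
my $b      = Math::BigInt->new(7)->bpow(100)->bmod($n);
my $expect = ($a * $b * Math::BigInt->new(2)->bpow(256)->bmodinv($n)) % $n;
my $got    = ord_mul_mont_model($a, $b);

print $got == $expect ? "model agrees\n" : "mismatch\n";
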
@@ -334,152 +344,1231 @@ ecp_nistz256_add:
mov $a1, $t1
adc \$0, $t4
- sub 8*0($a_ptr), $a0
- mov $a2, $t2
- sbb 8*1($a_ptr), $a1
- sbb 8*2($a_ptr), $a2
- mov $a3, $t3
- sbb 8*3($a_ptr), $a3
- sbb \$0, $t4
+ sub 8*0($a_ptr), $a0
+ mov $a2, $t2
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a3, $t3
+ sbb 8*3($a_ptr), $a3
+ sbb \$0, $t4
+
+ cmovc $t0, $a0
+ cmovc $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovc $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovc $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Ladd_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_add,.-ecp_nistz256_add
+
+################################################################################
+# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_sub
+.type ecp_nistz256_sub,\@function,3
+.align 32
+ecp_nistz256_sub:
+.cfi_startproc
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+.Lsub_body:
+
+ mov 8*0($a_ptr), $a0
+ xor $t4, $t4
+ mov 8*1($a_ptr), $a1
+ mov 8*2($a_ptr), $a2
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+
+ sub 8*0($b_ptr), $a0
+ sbb 8*1($b_ptr), $a1
+ mov $a0, $t0
+ sbb 8*2($b_ptr), $a2
+ sbb 8*3($b_ptr), $a3
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lsub_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+################################################################################
+# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_neg
+.type ecp_nistz256_neg,\@function,2
+.align 32
+ecp_nistz256_neg:
+.cfi_startproc
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+.Lneg_body:
+
+ xor $a0, $a0
+ xor $a1, $a1
+ xor $a2, $a2
+ xor $a3, $a3
+ xor $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a0, $t0
+ sbb 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lneg_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+___
+}
+{
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
+my ($poly1,$poly3)=($acc6,$acc7);
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_ord_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
+
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mul_body:
+
+ mov 8*0($b_org), %rax
+ mov $b_org, $b_ptr
+ lea .Lord(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# * b[0]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ mov %rax, $acc0
+ mov $t0, %rax
+ mov %rdx, $acc1
+
+ mulq 8*1($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq 8*2($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc0, $acc5
+ imulq %r15,$acc0
+
+ mov %rdx, $acc3
+ mulq 8*3($a_ptr)
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# First reduction step
+ mulq 8*0(%r14)
+ mov $acc0, $t1
+ add %rax, $acc5 # guaranteed to be zero
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $acc0 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $t1, %rax
+ adc %rdx, $acc2
+ mov $t1, %rdx
+ adc \$0, $acc0 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+
+ ################################# * b[1]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc1, $t0
+ imulq %r15, $acc1
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ xor $acc0, $acc0
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ################################# Second reduction step
+ mulq 8*0(%r14)
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc1, %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $acc1 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc %rdx, $acc3
+ mov $t1, %rdx
+ adc \$0, $acc1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc1, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+
+ ################################## * b[2]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc2, $t0
+ imulq %r15, $acc2
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ xor $acc1, $acc1
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ################################# Third reduction step
+ mulq 8*0(%r14)
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc2, %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc4
+ sbb \$0, $acc2 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc %rdx, $acc4
+ mov $t1, %rdx
+ adc \$0, $acc2 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc2, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+
+ ################################# * b[3]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc3, $t0
+ imulq %r15, $acc3
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc0
+ adc \$0, %rdx
+ xor $acc2, $acc2
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ################################# Last reduction step
+ mulq 8*0(%r14)
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc3, %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc5
+ sbb \$0, $acc3 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc %rdx, $acc5
+ mov $t1, %rdx
+ adc \$0, $acc3 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc3, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ################################# Subtract ord
+ mov $acc4, $a_ptr
+ sub 8*0(%r14), $acc4
+ mov $acc5, $acc3
+ sbb 8*1(%r14), $acc5
+ mov $acc0, $t0
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $a_ptr, $acc4
+ cmovc $acc3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mul_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# int rep);
+
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqr_body:
+
+ mov 8*0($a_ptr), $acc0
+ mov 8*1($a_ptr), %rax
+ mov 8*2($a_ptr), $acc6
+ mov 8*3($a_ptr), $acc7
+ lea .Lord(%rip), $a_ptr # pointer to modulus
+ mov $b_org, $b_ptr
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+ ################################# a[1:] * a[0]
+ mov %rax, $t1 # put aside a[1]
+ mul $acc0 # a[1] * a[0]
+ mov %rax, $acc1
+ movq $t1, %xmm1 # offload a[1]
+ mov $acc6, %rax
+ mov %rdx, $acc2
+
+ mul $acc0 # a[2] * a[0]
+ add %rax, $acc2
+ mov $acc7, %rax
+ movq $acc6, %xmm2 # offload a[2]
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mul $acc0 # a[3] * a[0]
+ add %rax, $acc3
+ mov $acc7, %rax
+ movq $acc7, %xmm3 # offload a[3]
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# a[3] * a[2]
+ mul $acc6 # a[3] * a[2]
+ mov %rax, $acc5
+ mov $acc6, %rax
+ mov %rdx, $acc6
+
+ ################################# a[2:] * a[1]
+ mul $t1 # a[2] * a[1]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc7
+
+ mul $t1 # a[3] * a[1]
+ add %rax, $acc4
+ adc \$0, %rdx
+
+ add $acc7, $acc4
+ adc %rdx, $acc5
+ adc \$0, $acc6 # can't overflow
+
+ ################################# *2
+ xor $acc7, $acc7
+ mov $acc0, %rax
+ add $acc1, $acc1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ ################################# Missing products
+ mul %rax # a[0] * a[0]
+ mov %rax, $acc0
+ movq %xmm1, %rax
+ mov %rdx, $t1
+
+ mul %rax # a[1] * a[1]
+ add $t1, $acc1
+ adc %rax, $acc2
+ movq %xmm2, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mul %rax # a[2] * a[2]
+ add $t1, $acc3
+ adc %rax, $acc4
+ movq %xmm3, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mov $acc0, $t0
+ imulq 8*4($a_ptr), $acc0 # *= .LordK
+
+ mul %rax # a[3] * a[3]
+ add $t1, $acc5
+ adc %rax, $acc6
+ mov 8*0($a_ptr), %rax # modulus[0]
+ adc %rdx, $acc7 # can't overflow
+
+ ################################# First reduction step
+ mul $acc0
+ mov $acc0, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax # modulus[1]
+ adc %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc0
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $acc0, %rax
+ adc %rdx, $acc2
+ mov $acc0, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc1, $t0
+ imulq 8*4($a_ptr), $acc1 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc0 # can't borrow
+
+ add $t1, $acc3
+ adc \$0, $acc0 # can't overflow
+
+ ################################# Second reduction step
+ mul $acc1
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc1
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $acc1, %rax
+ adc %rdx, $acc3
+ mov $acc1, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc2, $t0
+ imulq 8*4($a_ptr), $acc2 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc1 # can't borrow
+
+ add $t1, $acc0
+ adc \$0, $acc1 # can't overflow
+
+ ################################# Third reduction step
+ mul $acc2
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc0
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc2
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ mov $acc2, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc3, $t0
+ imulq 8*4($a_ptr), $acc3 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc1
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc2 # can't borrow
+
+ add $t1, $acc1
+ adc \$0, $acc2 # can't overflow
+
+ ################################# Last reduction step
+ mul $acc3
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc1
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc3
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ mov $acc3, %rdx
+ adc \$0, $t1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc2
+ sbb %rdx, $acc3 # can't borrow
+
+ add $t1, $acc2
+ adc \$0, $acc3 # can't overflow
+
+ ################################# Add bits [511:256] of the sqr result
+ xor %rdx, %rdx
+ add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc0, $acc4
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ################################# Compare to modulus
+ sub 8*0($a_ptr), $acc0
+ mov $acc2, $acc6
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc7
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rdx
+
+ cmovc $acc4, $acc0
+ cmovnc $acc1, %rax
+ cmovnc $acc2, $acc6
+ cmovnc $acc3, $acc7
+
+ dec $b_ptr
+ jnz .Loop_ord_sqr
+
+ mov $acc0, 8*0($r_ptr)
+ mov %rax, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc6, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc7, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqr_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
+$code.=<<___ if ($addx);
+################################################################################
+.type ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mulx_body:
+
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+ lea .Lord-128(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mulx $acc3, $t1, $acc3
+ add $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ mulx %r15, %rdx, %rax
+ adc $t1, $acc2
+ adc $t0, $acc3
+ adc \$0, $acc4
+
+ ################################# reduction
+ xor $acc5, $acc5 # $acc5=0, cf=0, of=0
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adcx $t0, $acc3
+ adox $t1, $acc4
+ adcx $acc0, $acc4
+ adox $acc0, $acc5
+ adc \$0, $acc5 # cf=0, of=0
+
+ ################################# Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc1 # guaranteed to be zero
+ adox $t1, $acc2
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adcx $acc1, $acc5
+ adox $acc1, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adcx $t0, $acc5
+ adox $t1, $acc0
+ adcx $acc2, $acc0
+ adox $acc2, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc0
+ adox $t1, $acc1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc3 # guaranteed to be zero
+ adox $t1, $acc4
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128(%r14), $t0, $t1
+ lea 128(%r14),%r14
+ mov $acc4, $t2
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mov $acc5, $t3
+ adcx $acc3, $acc1
+ adox $acc3, $acc2
+ adc \$0, $acc2
+
+ #################################
+ # Branch-less conditional subtraction of ord
+ mov $acc0, $t0
+ sub 8*0(%r14), $acc4
+ sbb 8*1(%r14), $acc5
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
- cmovc $t0, $a0
- cmovc $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovc $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovc $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
- mov 0(%rsp),%r13
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
.cfi_restore %r13
- mov 8(%rsp),%r12
+ mov 24(%rsp),%r12
.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Ladd_epilogue:
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
ret
.cfi_endproc
-.size ecp_nistz256_add,.-ecp_nistz256_add
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
-################################################################################
-# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
-.globl ecp_nistz256_sub
-.type ecp_nistz256_sub,\@function,3
+.type ecp_nistz256_ord_sqr_montx,\@function,3
.align 32
-ecp_nistz256_sub:
+ecp_nistz256_ord_sqr_montx:
.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
-.Lsub_body:
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqrx_body:
- mov 8*0($a_ptr), $a0
- xor $t4, $t4
- mov 8*1($a_ptr), $a1
- mov 8*2($a_ptr), $a2
- mov 8*3($a_ptr), $a3
- lea .Lpoly(%rip), $a_ptr
+ mov $b_org, $b_ptr
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea .Lord(%rip), $a_ptr
+ jmp .Loop_ord_sqrx
- sub 8*0($b_ptr), $a0
- sbb 8*1($b_ptr), $a1
- mov $a0, $t0
- sbb 8*2($b_ptr), $a2
- sbb 8*3($b_ptr), $a3
- mov $a1, $t1
- sbb \$0, $t4
+.align 32
+.Loop_ord_sqrx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ mov %rdx, %rax # offload a[0]
+ movq $acc6, %xmm1 # offload a[1]
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ add $t0, $acc2
+ movq $acc7, %xmm2 # offload a[2]
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
- add 8*0($a_ptr), $a0
- mov $a2, $t2
- adc 8*1($a_ptr), $a1
- adc 8*2($a_ptr), $a2
- mov $a3, $t3
- adc 8*3($a_ptr), $a3
- test $t4, $t4
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov %rax, %rdx
+ movq $acc0, %xmm3 # offload a[3]
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
- cmovz $t0, $a0
- cmovz $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ ################################# a[i]*a[i]
+ mulx %rdx, $acc0, $t1
+ movq %xmm1, %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ movq %xmm2, %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ mulx %rdx, $t0, $t1
+ .byte 0x67
+ movq %xmm3, %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ adox $t1, $acc5
+ mulx %rdx, $t0, $t4
+ adox $t0, $acc6
+ adox $t4, $acc7
- mov 0(%rsp),%r13
-.cfi_restore %r13
- mov 8(%rsp),%r12
-.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Lsub_epilogue:
- ret
-.cfi_endproc
-.size ecp_nistz256_sub,.-ecp_nistz256_sub
+ ################################# reduction
+ mov $acc0, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
-################################################################################
-# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
-.globl ecp_nistz256_neg
-.type ecp_nistz256_neg,\@function,2
-.align 32
-ecp_nistz256_neg:
-.cfi_startproc
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
-.Lneg_body:
+ xor %rax, %rax # cf=0, of=0
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0 # of=0
+ adcx %rax, $acc0 # cf=0
- xor $a0, $a0
- xor $a1, $a1
- xor $a2, $a2
- xor $a3, $a3
- xor $t4, $t4
+ #################################
+ mov $acc1, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
- sub 8*0($a_ptr), $a0
- sbb 8*1($a_ptr), $a1
- sbb 8*2($a_ptr), $a2
- mov $a0, $t0
- sbb 8*3($a_ptr), $a3
- lea .Lpoly(%rip), $a_ptr
- mov $a1, $t1
- sbb \$0, $t4
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc1 # guaranteed to be zero
+ adcx $t1, $acc2
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc3
+ adcx $t1, $acc0
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1 # cf=0
+ adox %rax, $acc1 # of=0
- add 8*0($a_ptr), $a0
- mov $a2, $t2
- adc 8*1($a_ptr), $a1
- adc 8*2($a_ptr), $a2
- mov $a3, $t3
- adc 8*3($a_ptr), $a3
- test $t4, $t4
+ #################################
+ mov $acc2, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
- cmovz $t0, $a0
- cmovz $t1, $a1
- mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
- mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
- mov $a2, 8*2($r_ptr)
- mov $a3, 8*3($r_ptr)
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2 # of=0
+ adcx %rax, $acc2 # cf=0
- mov 0(%rsp),%r13
+ #################################
+ mov $acc3, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc3 # guaranteed to be zero
+ adcx $t1, $acc0
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc1
+ adcx $t1, $acc2
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ adox %rax, $acc3
+
+ ################################# accumulate upper half
+ add $acc0, $acc4 # add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc4, %rdx
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, $acc6
+ adc \$0, %rax
+
+ ################################# compare to modulus
+ sub 8*0($a_ptr), $acc4
+ mov $acc2, $acc7
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc0
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rax
+
+ cmovnc $acc4, %rdx
+ cmovnc $acc1, $acc6
+ cmovnc $acc2, $acc7
+ cmovnc $acc3, $acc0
+
+ dec $b_ptr
+ jnz .Loop_ord_sqrx
+
+ mov %rdx, 8*0($r_ptr)
+ mov $acc6, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc7, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc0, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
.cfi_restore %r13
- mov 8(%rsp),%r12
+ mov 24(%rsp),%r12
.cfi_restore %r12
- lea 16(%rsp),%rsp
-.cfi_adjust_cfa_offset -16
-.Lneg_epilogue:
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
ret
.cfi_endproc
-.size ecp_nistz256_neg,.-ecp_nistz256_neg
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___
-}
-{
-my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
-my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
-my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
-my ($poly1,$poly3)=($acc6,$acc7);
$code.=<<___;
################################################################################
@@ -3366,6 +4455,24 @@ full_handler:
.rva .LSEH_end_ecp_nistz256_neg
.rva .LSEH_info_ecp_nistz256_neg
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_end_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_info_ecp_nistz256_ord_mul_mont
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_end_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_info_ecp_nistz256_ord_mul_montx
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
.rva .LSEH_begin_ecp_nistz256_to_mont
.rva .LSEH_end_ecp_nistz256_to_mont
.rva .LSEH_info_ecp_nistz256_to_mont
@@ -3453,6 +4560,30 @@ $code.=<<___;
.byte 9,0,0,0
.rva short_handler
.rva .Lneg_body,.Lneg_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_ord_mul_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___ if ($addx);
+.LSEH_info_ecp_nistz256_ord_mul_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___;
.LSEH_info_ecp_nistz256_to_mont:
.byte 9,0,0,0
.rva full_handler
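
The _montx variants above are only reached when the 0x80100 test against OPENSSL_ia32cap_P+8 passes, i.e. when both feature bits behind that mask are set. Going by the usual OpenSSL capability-vector layout (not something this patch states), that word holds CPUID.(7,0):EBX, where bit 8 is BMI2 (mulx) and bit 19 is ADX (adcx/adox). Decoding the mask:

use strict;
use warnings;

my $mask = 0x80100;                                  # the value tested above
my @bits = grep { ($mask >> $_) & 1 } 0 .. 31;
print "0x80100 sets bits @bits\n";                   # bits 8 and 19
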
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 9f82b4e..efec5a7 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -48,6 +48,8 @@ static const ERR_STRING_DATA EC_str_functs[] = {
"ECPKParameters_print_fp"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_GET_AFFINE, 0),
"ecp_nistz256_get_affine"},
+ {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_INV_MOD_ORD, 0),
+ "ecp_nistz256_inv_mod_ord"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 0),
"ecp_nistz256_mult_precompute"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_POINTS_MUL, 0),
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 6cc0190..540aa53 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -155,6 +155,9 @@ struct ec_method_st {
/* custom ECDH operation */
int (*ecdh_compute_key)(unsigned char **pout, size_t *poutlen,
const EC_POINT *pub_key, const EC_KEY *ecdh);
+ /* Inverse modulo order */
+ int (*field_inverse_mod_ord)(const EC_GROUP *, BIGNUM *r, BIGNUM *x,
+ BN_CTX *ctx);
};
/*
@@ -520,7 +523,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
unsigned char *digit, unsigned char in);
#endif
-int ec_precompute_mont_data(EC_GROUP *);
int ec_group_simple_order_bits(const EC_GROUP *group);
#ifdef ECP_NISTZ256_ASM
@@ -604,3 +606,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
const uint8_t peer_public_value[32]);
void X25519_public_from_private(uint8_t out_public_value[32],
const uint8_t private_key[32]);
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx);
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index 7ae48cf..8d508dd 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -261,6 +261,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
return meth->field_type;
}
+static int ec_precompute_mont_data(EC_GROUP *);
+
int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
const BIGNUM *order, const BIGNUM *cofactor)
{
@@ -961,7 +963,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
* ec_precompute_mont_data sets |group->mont_data| from |group->order| and
* returns one on success. On error it returns zero.
*/
-int ec_precompute_mont_data(EC_GROUP *group)
+static int ec_precompute_mont_data(EC_GROUP *group)
{
BN_CTX *ctx = BN_CTX_new();
int ret = 0;
@@ -1006,3 +1008,12 @@ int ec_group_simple_order_bits(const EC_GROUP *group)
return 0;
return BN_num_bits(group->order);
}
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ if (group->meth->field_inverse_mod_ord != NULL)
+ return group->meth->field_inverse_mod_ord(group, res, x, ctx);
+ else
+ return 0;
+}
diff --git a/crypto/ec/ecdsa_ossl.c b/crypto/ec/ecdsa_ossl.c
index 30458f1..a405d38 100644
--- a/crypto/ec/ecdsa_ossl.c
+++ b/crypto/ec/ecdsa_ossl.c
@@ -153,30 +153,33 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
}
while (BN_is_zero(r));
- /* compute the inverse of k */
- if (EC_GROUP_get_mont_data(group) != NULL) {
- /*
- * We want inverse in constant time, therefore we utilize the fact
- * order must be prime and use Fermat's Little Theorem instead.
- */
- if (!BN_set_word(X, 2)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- if (!BN_mod_sub(X, order, X, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- BN_set_flags(X, BN_FLG_CONSTTIME);
- if (!BN_mod_exp_mont_consttime
- (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
- }
- } else {
- if (!BN_mod_inverse(k, k, order, ctx)) {
- ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
+ /* compute the inverse of k */
+ if (group->mont_data != NULL) {
+ /*
+ * We want inverse in constant time, therefore we utilize the fact
+ * order must be prime and use Fermat's Little Theorem instead.
+ */
+ if (!BN_set_word(X, 2)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ if (!BN_mod_sub(X, order, X, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ BN_set_flags(X, BN_FLG_CONSTTIME);
+ if (!BN_mod_exp_mont_consttime(k, k, X, order, ctx,
+ group->mont_data)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
+ } else {
+ if (!BN_mod_inverse(k, k, order, ctx)) {
+ ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+ goto err;
+ }
}
}
@@ -407,9 +410,12 @@ int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
goto err;
}
/* calculate tmp1 = inv(S) mod order */
- if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
- ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
- goto err;
+ /* Check if optimized inverse is implemented */
+ if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
+ if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
+ ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
+ goto err;
+ }
}
/* digest -> m */
i = BN_num_bits(order);
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index 3863a61..08a7e84 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -1,15 +1,17 @@
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+ * Copyright (c) 2015, CloudFlare, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
- * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
+ * (3) CloudFlare, Inc.
*
* Reference:
* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -908,7 +910,7 @@ __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
*/
#if defined(ECP_NISTZ256_AVX2)
# if !(defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined(_MX64)) || \
+ defined(_M_AMD64) || defined(_M_X64)) || \
!(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
# undef ECP_NISTZ256_AVX2
# else
@@ -1495,6 +1497,189 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
return HAVEPRECOMP(group, nistz256);
}
+#if defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__powerpc64__) || defined(_ARCH_PPC64) || \
+ defined(__aarch64__)
+/*
+ * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
+ */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ int rep);
+
+static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ /* RR = 2^512 mod ord(p256) */
+ static const BN_ULONG RR[P256_LIMBS] = {
+ TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6),
+ TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620)
+ };
+ /* The constant 1 (unlike ONE that is one in Montgomery representation) */
+ static const BN_ULONG one[P256_LIMBS] = {
+ TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0)
+ };
+ /*
+ * We don't use entry 0 in the table, so we omit it and address
+ * with -1 offset.
+ */
+ BN_ULONG table[15][P256_LIMBS];
+ BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
+ int i, ret = 0;
+
+ /*
+ * Catch allocation failure early.
+ */
+ if (bn_wexpand(r, P256_LIMBS) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
+ BIGNUM *tmp;
+
+ if ((tmp = BN_CTX_get(ctx)) == NULL
+ || !BN_nnmod(tmp, x, group->order, ctx)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+ x = tmp;
+ }
+
+ if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ ecp_nistz256_ord_mul_mont(table[0], t, RR);
+#if 0
+ /*
+ * Original sparse-then-fixed-window algorithm, retained for reference.
+ */
+ for (i = 2; i < 16; i += 2) {
+ ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
+ ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
+ }
+
+ /*
+ * The top 128 bits of the exponent are highly redundant, so we
+ * perform an optimized flow
+ */
+ ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */
+ ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */
+ ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */
+
+ /*
+ * The bottom 128 bits of the exponent are processed with a fixed 4-bit window
+ */
+ for(i = 0; i < 32; i++) {
+ /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2),
+ * split into nibbles */
+ static const unsigned char expLo[32] = {
+ 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
+ 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf
+ };
+
+ ecp_nistz256_ord_sqr_mont(out, out, 4);
+ /* The exponent is public, no need for constant-time access */
+ ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
+ }
+#else
+ /*
+ * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+ *
+ * Even though this code path spares 12 squarings (4.5%) and 13
+ * multiplications (25%), the sign operation as a whole is not that
+ * much faster, not more than 2%...
+ */
+ enum {
+ i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111,
+ i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32
+ };
+
+ /* pre-calculate powers */
+ ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
+
+ ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
+
+ ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
+ ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
+
+ ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
+
+ ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
+ ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
+ ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
+
+ ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
+ ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
+
+ /* calculations */
+ ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64);
+ ecp_nistz256_ord_mul_mont(out, out, table[i_x32]);
+
+ for (i = 0; i < 27; i++) {
+ static const struct { unsigned char p, i; } chain[27] = {
+ { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 },
+ { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 },
+ { 4, i_101 }, { 3, i_101 }, { 3, i_101 },
+ { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 },
+ { 2, i_1 }, { 5, i_1 }, { 6, i_1111 },
+ { 5, i_111 }, { 4, i_111 }, { 5, i_111 },
+ { 5, i_101 }, { 3, i_11 }, { 10, i_101111 },
+ { 2, i_11 }, { 5, i_11 }, { 5, i_11 },
+ { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 }
+ };
+
+ ecp_nistz256_ord_sqr_mont(out, out, chain[i].p);
+ ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]);
+ }
+#endif
+ ecp_nistz256_ord_mul_mont(out, out, one);
+
+ /*
+ * Can't fail, but check return code to be consistent anyway.
+ */
+ if (!bn_set_words(r, out, P256_LIMBS))
+ goto err;
+
+ ret = 1;
+err:
+ return ret;
+}
+#else
+# define ecp_nistz256_inv_mod_ord NULL
+#endif
+
const EC_METHOD *EC_GFp_nistz256_method(void)
{
static const EC_METHOD ret = {
@@ -1544,7 +1729,8 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
ec_key_simple_generate_public_key,
0, /* keycopy */
0, /* keyfinish */
- ecdh_simple_compute_key
+ ecdh_simple_compute_key,
+ ecp_nistz256_inv_mod_ord /* can be #define-d NULL */
};
return &ret;
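
The #else branch above is the faster addition chain from the first commit (credited to the briansmith.org write-up). It can be checked independently of the Montgomery code by replaying it at the exponent level: a squaring by p multiplies the running exponent by 2^p, a multiplication by a table entry adds that entry's exponent, and the result should come out to n - 2, the Fermat exponent. The script below performs that comparison rather than asserting it:

use strict;
use warnings;
use Math::BigInt;

my $n = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff" .
                               "bce6faada7179e84f3b9cac2fc632551");

# exponents represented by the precomputed table entries
my %e = (i_1 => 1, i_10 => 2, i_11 => 3, i_101 => 5, i_111 => 7,
         i_1010 => 10, i_1111 => 15, i_10101 => 21, i_101010 => 42,
         i_101111 => 47, i_x6 => 63, i_x8 => 255, i_x16 => 65535,
         i_x32 => Math::BigInt->new(2)->bpow(32) - 1);

# out = sqr(x32, 64); out *= x32  =>  exponent (2^32-1)*2^64 + (2^32-1)
my $out = $e{i_x32} * Math::BigInt->new(2)->bpow(64) + $e{i_x32};

# the 27 (squarings, table entry) steps of chain[]
my @chain = ([32,'i_x32'],   [6,'i_101111'], [5,'i_111'],   [4,'i_11'],
             [5,'i_1111'],   [5,'i_10101'],  [4,'i_101'],   [3,'i_101'],
             [3,'i_101'],    [5,'i_111'],    [9,'i_101111'],[6,'i_1111'],
             [2,'i_1'],      [5,'i_1'],      [6,'i_1111'],  [5,'i_111'],
             [4,'i_111'],    [5,'i_111'],    [5,'i_101'],   [3,'i_11'],
             [10,'i_101111'],[2,'i_11'],     [5,'i_11'],    [5,'i_11'],
             [3,'i_1'],      [7,'i_10101'],  [6,'i_1111']);

for my $step (@chain) {
    $out = ($out << $step->[0]) + $e{ $step->[1] };
}

print $out == $n - 2 ? "chain reproduces n-2\n" : "mismatch\n";
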
diff --git a/crypto/err/openssl.txt b/crypto/err/openssl.txt
index 9ec0009..6449662 100644
--- a/crypto/err/openssl.txt
+++ b/crypto/err/openssl.txt
@@ -458,6 +458,7 @@ EC_F_ECPARAMETERS_PRINT_FP:148:ECParameters_print_fp
EC_F_ECPKPARAMETERS_PRINT:149:ECPKParameters_print
EC_F_ECPKPARAMETERS_PRINT_FP:150:ECPKParameters_print_fp
EC_F_ECP_NISTZ256_GET_AFFINE:240:ecp_nistz256_get_affine
+EC_F_ECP_NISTZ256_INV_MOD_ORD:275:ecp_nistz256_inv_mod_ord
EC_F_ECP_NISTZ256_MULT_PRECOMPUTE:243:ecp_nistz256_mult_precompute
EC_F_ECP_NISTZ256_POINTS_MUL:241:ecp_nistz256_points_mul
EC_F_ECP_NISTZ256_PRE_COMP_NEW:244:ecp_nistz256_pre_comp_new
diff --git a/include/openssl/ecerr.h b/include/openssl/ecerr.h
index bd09cb7..a1b9ea1 100644
--- a/include/openssl/ecerr.h
+++ b/include/openssl/ecerr.h
@@ -50,6 +50,7 @@ int ERR_load_EC_strings(void);
# define EC_F_ECPKPARAMETERS_PRINT 149
# define EC_F_ECPKPARAMETERS_PRINT_FP 150
# define EC_F_ECP_NISTZ256_GET_AFFINE 240
+# define EC_F_ECP_NISTZ256_INV_MOD_ORD 275
# define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE 243
# define EC_F_ECP_NISTZ256_POINTS_MUL 241
# define EC_F_ECP_NISTZ256_PRE_COMP_NEW 244