[openssl-commits] [openssl] master update

Sun Jul 15 17:13:45 UTC 2018

The branch master has been updated
       via  3c849bc901fa191fc517bc20d905783e6e428de5 (commit)
       via  d3e3263072c91999afc256fa4666c40912dde410 (commit)
       via  dfd5fb09500d5800b37b3aec05884fc7409032d7 (commit)
       via  2de607d8c952fef0cadf158b0a020037837911ac (commit)
      from  5d1c09de1f2736e1d4b1877206d08455ec75f558 (commit)


- Log -----------------------------------------------------------------
commit 3c849bc901fa191fc517bc20d905783e6e428de5
Author: Andy Polyakov <appro at openssl.org>
Date:   Thu Jul 12 11:53:16 2018 +0200

    ec/curve25519.c: reorganize for better accessibility.
    
    Move base 2^64 code to its own #if section. It was nested in the base 2^51
    section, which arguably might have been tricky to follow.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit d3e3263072c91999afc256fa4666c40912dde410
Author: Andy Polyakov <appro at openssl.org>
Date:   Wed Jul 11 22:36:49 2018 +0200

    ec/asm/x25519-x86_64.pl: add CFI directives and Windows SE handler.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit dfd5fb09500d5800b37b3aec05884fc7409032d7
Author: Andy Polyakov <appro at openssl.org>
Date:   Wed Jul 11 22:22:52 2018 +0200

    test/.../evppkey.txt: X25519 regression test vectors.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6699)

commit 2de607d8c952fef0cadf158b0a020037837911ac
Author: Andy Polyakov <appro at openssl.org>
Date:   Wed Jul 11 22:08:02 2018 +0200

    ec/asm/x25519-x86_64.pl: fix base 2^64 add/sub and final reduction.
    
    Base 2^64 addition/subtraction and final reduction failed to treat
    partially reduced values correctly.
    
    Thanks to the Wycheproof Project for the vectors and to Paul Kehrer for the report.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6699)

-----------------------------------------------------------------------

Summary of changes:
 crypto/ec/asm/x25519-x86_64.pl            | 318 +++++++++++++++++++++++++++++-
 crypto/ec/curve25519.c                    | 293 +++++++++++++--------------
 test/recipes/30-test_evp_data/evppkey.txt |  38 ++++
 3 files changed, 501 insertions(+), 148 deletions(-)

diff --git a/crypto/ec/asm/x25519-x86_64.pl b/crypto/ec/asm/x25519-x86_64.pl
index 930d7bd..da81e06 100755
--- a/crypto/ec/asm/x25519-x86_64.pl
+++ b/crypto/ec/asm/x25519-x86_64.pl
@@ -102,13 +102,22 @@ $code.=<<___;
 .type	x25519_fe51_mul,\@function,3
 .align	32
 x25519_fe51_mul:
+.cfi_startproc
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset	40
+.Lfe51_mul_body:
 
 	mov	8*0(%rsi),%rax		# f[0]
 	mov	8*0(%rdx),%r11		# load g[0-4]
@@ -236,19 +245,30 @@ x25519_fe51_mul:
 
 	mov	8*4(%rsp),%rdi		# restore 1st argument
 	jmp	.Lreduce51
+.Lfe51_mul_epilogue:
+.cfi_endproc
 .size	x25519_fe51_mul,.-x25519_fe51_mul
 
 .globl	x25519_fe51_sqr
 .type	x25519_fe51_sqr,\@function,2
 .align	32
 x25519_fe51_sqr:
+.cfi_startproc
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	lea	-8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset	40
+.Lfe51_sqr_body:
 
 	mov	8*0(%rsi),%rax		# g[0]
 	mov	8*2(%rsi),%r15		# g[2]
@@ -391,27 +411,45 @@ x25519_fe51_sqr:
 	mov	%r10,8*4(%rdi)
 
 	mov	8*5(%rsp),%r15
+.cfi_restore	%r15
 	mov	8*6(%rsp),%r14
+.cfi_restore	%r14
 	mov	8*7(%rsp),%r13
+.cfi_restore	%r13
 	mov	8*8(%rsp),%r12
+.cfi_restore	%r12
 	mov	8*9(%rsp),%rbx
+.cfi_restore	%rbx
 	mov	8*10(%rsp),%rbp
+.cfi_restore	%rbp
 	lea	8*11(%rsp),%rsp
+.cfi_adjust_cfa_offset	88
+.Lfe51_sqr_epilogue:
 	ret
+.cfi_endproc
 .size	x25519_fe51_sqr,.-x25519_fe51_sqr
 
 .globl	x25519_fe51_mul121666
 .type	x25519_fe51_mul121666,\@function,2
 .align	32
 x25519_fe51_mul121666:
+.cfi_startproc
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
-	mov	\$121666,%eax
+.cfi_push	%r15
 	lea	-8*5(%rsp),%rsp
+.cfi_adjust_cfa_offset	40
+.Lfe51_mul121666_body:
+	mov	\$121666,%eax
 
 	mulq	8*0(%rsi)
 	mov	%rax,%rbx		# %rbx:%rcx = h0
@@ -434,6 +472,8 @@ x25519_fe51_mul121666:
 	mov	%rdx,%r15
 
 	jmp	.Lreduce51
+.Lfe51_mul121666_epilogue:
+.cfi_endproc
 .size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
 ___
 ########################################################################
@@ -460,14 +500,24 @@ x25519_fe64_eligible:
 .type	x25519_fe64_mul,\@function,3
 .align	32
 x25519_fe64_mul:
+.cfi_startproc
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	push	%rdi			# offload dst
+.cfi_push	%rdi
 	lea	-8*2(%rsp),%rsp
+.cfi_adjust_cfa_offset	16
+.Lfe64_mul_body:
 
 	mov	%rdx,%rax
 	mov	8*0(%rdx),%rbp		# b[0]
@@ -534,20 +584,32 @@ x25519_fe64_mul:
 	adox	%rdi,$acc7		# of=0
 
 	jmp	.Lreduce64
+.Lfe64_mul_epilogue:
+.cfi_endproc
 .size	x25519_fe64_mul,.-x25519_fe64_mul
 
 .globl	x25519_fe64_sqr
 .type	x25519_fe64_sqr,\@function,2
 .align	32
 x25519_fe64_sqr:
+.cfi_startproc
 	push	%rbp
+.cfi_push	%rbp
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	push	%rdi			# offload dst
+.cfi_push	%rdi
 	lea	-8*2(%rsp),%rsp
+.cfi_adjust_cfa_offset	16
+.Lfe64_sqr_body:
 
 	mov	8*0(%rsi),%rdx		# a[0]
 	mov	8*1(%rsi),%rcx		# a[1]
@@ -637,19 +699,29 @@ x25519_fe64_sqr:
 	mov	$acc0,8*0(%rdi)
 
 	mov	8*3(%rsp),%r15
+.cfi_restore	%r15
 	mov	8*4(%rsp),%r14
+.cfi_restore	%r14
 	mov	8*5(%rsp),%r13
+.cfi_restore	%r13
 	mov	8*6(%rsp),%r12
+.cfi_restore	%r12
 	mov	8*7(%rsp),%rbx
+.cfi_restore	%rbx
 	mov	8*8(%rsp),%rbp
+.cfi_restore	%rbp
 	lea	8*9(%rsp),%rsp
+.cfi_adjust_cfa_offset	88
+.Lfe64_sqr_epilogue:
 	ret
+.cfi_endproc
 .size	x25519_fe64_sqr,.-x25519_fe64_sqr
 
 .globl	x25519_fe64_mul121666
 .type	x25519_fe64_mul121666,\@function,2
 .align	32
 x25519_fe64_mul121666:
+.Lfe64_mul121666_body:
 	mov	\$121666,%edx
 	mulx	8*0(%rsi),$acc0,%rcx
 	mulx	8*1(%rsi),$acc1,%rax
@@ -676,6 +748,7 @@ x25519_fe64_mul121666:
 	mov	$acc3,8*3(%rdi)
 	mov	$acc0,8*0(%rdi)
 
+.Lfe64_mul121666_epilogue:
 	ret
 .size	x25519_fe64_mul121666,.-x25519_fe64_mul121666
 
@@ -683,6 +756,7 @@ x25519_fe64_mul121666:
 .type	x25519_fe64_add,\@function,3
 .align	32
 x25519_fe64_add:
+.Lfe64_add_body:
 	mov	8*0(%rsi),$acc0
 	mov	8*1(%rsi),$acc1
 	mov	8*2(%rsi),$acc2
@@ -698,13 +772,18 @@ x25519_fe64_add:
 
 	add	%rax,$acc0
 	adc	\$0,$acc1
-	mov	$acc0,8*0(%rdi)
 	adc	\$0,$acc2
 	mov	$acc1,8*1(%rdi)
 	adc	\$0,$acc3
 	mov	$acc2,8*2(%rdi)
+	sbb	%rax,%rax		# cf -> mask
 	mov	$acc3,8*3(%rdi)
+	and	\$38,%rax
 
+	add	%rax,$acc0
+	mov	$acc0,8*0(%rdi)
+
+.Lfe64_add_epilogue:
 	ret
 .size	x25519_fe64_add,.-x25519_fe64_add
 
@@ -712,6 +791,7 @@ x25519_fe64_add:
 .type	x25519_fe64_sub,\@function,3
 .align	32
 x25519_fe64_sub:
+.Lfe64_sub_body:
 	mov	8*0(%rsi),$acc0
 	mov	8*1(%rsi),$acc1
 	mov	8*2(%rsi),$acc2
@@ -727,13 +807,18 @@ x25519_fe64_sub:
 
 	sub	%rax,$acc0
 	sbb	\$0,$acc1
-	mov	$acc0,8*0(%rdi)
 	sbb	\$0,$acc2
 	mov	$acc1,8*1(%rdi)
 	sbb	\$0,$acc3
 	mov	$acc2,8*2(%rdi)
+	sbb	%rax,%rax		# cf -> mask
 	mov	$acc3,8*3(%rdi)
+	and	\$38,%rax
+
+	sub	%rax,$acc0
+	mov	$acc0,8*0(%rdi)
 
+.Lfe64_sub_epilogue:
 	ret
 .size	x25519_fe64_sub,.-x25519_fe64_sub
 
@@ -741,6 +826,7 @@ x25519_fe64_sub:
 .type	x25519_fe64_tobytes,\@function,2
 .align	32
 x25519_fe64_tobytes:
+.Lfe64_to_body:
 	mov	8*0(%rsi),$acc0
 	mov	8*1(%rsi),$acc1
 	mov	8*2(%rsi),$acc2
@@ -751,6 +837,7 @@ x25519_fe64_tobytes:
 	sar	\$63,$acc3		# most significant bit -> mask
 	shr	\$1,%rax		# most significant bit cleared
 	and	\$19,$acc3
+	add	\$19,$acc3		# compare to modulus in the same go
 
 	add	$acc3,$acc0
 	adc	\$0,$acc1
@@ -760,15 +847,20 @@ x25519_fe64_tobytes:
 	lea	(%rax,%rax),$acc3
 	sar	\$63,%rax		# most significant bit -> mask
 	shr	\$1,$acc3		# most significant bit cleared
+	not	%rax
 	and	\$19,%rax
 
-	add	%rax,$acc0
+	sub	%rax,$acc0
+	sbb	\$0,$acc1
+	sbb	\$0,$acc2
+	sbb	\$0,$acc3
 
+	mov	$acc0,8*0(%rdi)
 	mov	$acc1,8*1(%rdi)
 	mov	$acc2,8*2(%rdi)
 	mov	$acc3,8*3(%rdi)
-	mov	$acc0,8*0(%rdi)
 
+.Lfe64_to_epilogue:
 	ret
 .size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
 ___
@@ -804,6 +896,222 @@ $code.=<<___;
 .asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+
+.type	short_handler,\@abi-omnipotent
+.align	16
+short_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+	jmp	.Lcommon_seh_tail
+.size	short_handler,.-short_handler
+
+.type	full_handler,\@abi-omnipotent
+.align	16
+full_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rax,%r10),%rax
+
+	mov	-8(%rax),%rbp
+	mov	-16(%rax),%rbx
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	full_handler,.-full_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_x25519_fe51_mul
+	.rva	.LSEH_end_x25519_fe51_mul
+	.rva	.LSEH_info_x25519_fe51_mul
+
+	.rva	.LSEH_begin_x25519_fe51_sqr
+	.rva	.LSEH_end_x25519_fe51_sqr
+	.rva	.LSEH_info_x25519_fe51_sqr
+
+	.rva	.LSEH_begin_x25519_fe51_mul121666
+	.rva	.LSEH_end_x25519_fe51_mul121666
+	.rva	.LSEH_info_x25519_fe51_mul121666
+___
+$code.=<<___	if ($addx);
+	.rva	.LSEH_begin_x25519_fe64_mul
+	.rva	.LSEH_end_x25519_fe64_mul
+	.rva	.LSEH_info_x25519_fe64_mul
+
+	.rva	.LSEH_begin_x25519_fe64_sqr
+	.rva	.LSEH_end_x25519_fe64_sqr
+	.rva	.LSEH_info_x25519_fe64_sqr
+
+	.rva	.LSEH_begin_x25519_fe64_mul121666
+	.rva	.LSEH_end_x25519_fe64_mul121666
+	.rva	.LSEH_info_x25519_fe64_mul121666
+
+	.rva	.LSEH_begin_x25519_fe64_add
+	.rva	.LSEH_end_x25519_fe64_add
+	.rva	.LSEH_info_x25519_fe64_add
+
+	.rva	.LSEH_begin_x25519_fe64_sub
+	.rva	.LSEH_end_x25519_fe64_sub
+	.rva	.LSEH_info_x25519_fe64_sub
+
+	.rva	.LSEH_begin_x25519_fe64_tobytes
+	.rva	.LSEH_end_x25519_fe64_tobytes
+	.rva	.LSEH_info_x25519_fe64_tobytes
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_x25519_fe51_mul:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
+	.long	88,0
+.LSEH_info_x25519_fe51_sqr:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
+	.long	88,0
+.LSEH_info_x25519_fe51_mul121666:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
+	.long	88,0
+___
+$code.=<<___	if ($addx);
+.LSEH_info_x25519_fe64_mul:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
+	.long	72,0
+.LSEH_info_x25519_fe64_sqr:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
+	.long	72,0
+.LSEH_info_x25519_fe64_mul121666:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
+.LSEH_info_x25519_fe64_add:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
+.LSEH_info_x25519_fe64_sub:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
+.LSEH_info_x25519_fe64_tobytes:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
+___
+}
+
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
 close STDOUT;
diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c
index 9666de1..abe9b9c 100644
--- a/crypto/ec/curve25519.c
+++ b/crypto/ec/curve25519.c
@@ -11,149 +11,23 @@
 #include "ec_lcl.h"
 #include <openssl/sha.h>
 
-#if defined(X25519_ASM) \
-    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
-         && !defined(__sparc__) \
-         && !(defined(__ANDROID__) && !defined(__clang__)) )
-/*
- * Base 2^51 implementation.
- */
-# define BASE_2_51_IMPLEMENTED
-
-typedef uint64_t fe51[5];
-# if !defined(X25519_ASM)
-typedef __uint128_t u128;
-# endif
-
-static const uint64_t MASK51 = 0x7ffffffffffff;
-
-static uint64_t load_7(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-    result |= ((uint64_t)in[6]) << 48;
-
-    return result;
-}
-
-static uint64_t load_6(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-
-    return result;
-}
-
-static void fe51_frombytes(fe51 h, const uint8_t *s)
-{
-    uint64_t h0 = load_7(s);                                /* 56 bits */
-    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
-    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
-    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
-    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
-
-    h1 |= h0 >> 51; h0 &= MASK51;
-    h2 |= h1 >> 51; h1 &= MASK51;
-    h3 |= h2 >> 51; h2 &= MASK51;
-    h4 |= h3 >> 51; h3 &= MASK51;
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
-}
-
-static void fe51_tobytes(uint8_t *s, const fe51 h)
-{
-    uint64_t h0 = h[0];
-    uint64_t h1 = h[1];
-    uint64_t h2 = h[2];
-    uint64_t h3 = h[3];
-    uint64_t h4 = h[4];
-    uint64_t q;
+#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
+                            defined(_M_AMD64) || defined(_M_X64))
 
-    /* compare to modulus */
-    q = (h0 + 19) >> 51;
-    q = (h1 + q) >> 51;
-    q = (h2 + q) >> 51;
-    q = (h3 + q) >> 51;
-    q = (h4 + q) >> 51;
-
-    /* full reduce */
-    h0 += 19 * q;
-    h1 += h0 >> 51; h0 &= MASK51;
-    h2 += h1 >> 51; h1 &= MASK51;
-    h3 += h2 >> 51; h2 &= MASK51;
-    h4 += h3 >> 51; h3 &= MASK51;
-                    h4 &= MASK51;
-
-    /* smash */
-    s[0] = (uint8_t)(h0 >> 0);
-    s[1] = (uint8_t)(h0 >> 8);
-    s[2] = (uint8_t)(h0 >> 16);
-    s[3] = (uint8_t)(h0 >> 24);
-    s[4] = (uint8_t)(h0 >> 32);
-    s[5] = (uint8_t)(h0 >> 40);
-    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
-    s[7] = (uint8_t)(h1 >> 5);
-    s[8] = (uint8_t)(h1 >> 13);
-    s[9] = (uint8_t)(h1 >> 21);
-    s[10] = (uint8_t)(h1 >> 29);
-    s[11] = (uint8_t)(h1 >> 37);
-    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
-    s[13] = (uint8_t)(h2 >> 2);
-    s[14] = (uint8_t)(h2 >> 10);
-    s[15] = (uint8_t)(h2 >> 18);
-    s[16] = (uint8_t)(h2 >> 26);
-    s[17] = (uint8_t)(h2 >> 34);
-    s[18] = (uint8_t)(h2 >> 42);
-    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
-    s[20] = (uint8_t)(h3 >> 7);
-    s[21] = (uint8_t)(h3 >> 15);
-    s[22] = (uint8_t)(h3 >> 23);
-    s[23] = (uint8_t)(h3 >> 31);
-    s[24] = (uint8_t)(h3 >> 39);
-    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
-    s[26] = (uint8_t)(h4 >> 4);
-    s[27] = (uint8_t)(h4 >> 12);
-    s[28] = (uint8_t)(h4 >> 20);
-    s[29] = (uint8_t)(h4 >> 28);
-    s[30] = (uint8_t)(h4 >> 36);
-    s[31] = (uint8_t)(h4 >> 44);
-}
-
-# ifdef X25519_ASM
-void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
-void x25519_fe51_sqr(fe51 h, const fe51 f);
-void x25519_fe51_mul121666(fe51 h, fe51 f);
-#  define fe51_mul x25519_fe51_mul
-#  define fe51_sq  x25519_fe51_sqr
-#  define fe51_mul121666 x25519_fe51_mul121666
-
-#  if defined(__x86_64) || defined(__x86_64__) || \
-      defined(_M_AMD64) || defined(_M_X64)
-
-#   define BASE_2_64_IMPLEMENTED
+# define BASE_2_64_IMPLEMENTED
 
 typedef uint64_t fe64[4];
 
 int x25519_fe64_eligible(void);
 
 /*
- * There are no reference C implementations for this radix.
+ * Following subroutines perform corresponding operations modulo
+ * 2^256-38, i.e. double the curve modulus. However, inputs and
+ * outputs are permitted to be partially reduced, i.e. to remain
+ * in [0..2^256) range. It's all tied up in final fe64_tobytes
+ * that performs full reduction modulo 2^255-19.
+ *
+ * There are no reference C implementations for these.
  */
 void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sqr(fe64 h, const fe64 f);
@@ -161,12 +35,12 @@ void x25519_fe64_mul121666(fe64 h, fe64 f);
 void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
-#   define fe64_mul x25519_fe64_mul
-#   define fe64_sqr x25519_fe64_sqr
-#   define fe64_mul121666 x25519_fe64_mul121666
-#   define fe64_add x25519_fe64_add
-#   define fe64_sub x25519_fe64_sub
-#   define fe64_tobytes x25519_fe64_tobytes
+# define fe64_mul x25519_fe64_mul
+# define fe64_sqr x25519_fe64_sqr
+# define fe64_mul121666 x25519_fe64_mul121666
+# define fe64_add x25519_fe64_add
+# define fe64_sub x25519_fe64_sub
+# define fe64_tobytes x25519_fe64_tobytes
 
 static uint64_t load_8(const uint8_t *in)
 {
@@ -375,10 +249,143 @@ static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
 
     OPENSSL_cleanse(e, sizeof(e));
 }
-#  endif
+#endif
+
+#if defined(X25519_ASM) \
+    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+         && !defined(__sparc__) \
+         && !(defined(__ANDROID__) && !defined(__clang__)) )
+/*
+ * Base 2^51 implementation. It's virtually no different from reference
+ * base 2^25.5 implementation in respect to lax boundary conditions for
+ * intermediate values and even individual limbs. So that whatever you
+ * know about the reference, applies even here...
+ */
+# define BASE_2_51_IMPLEMENTED
+
+typedef uint64_t fe51[5];
+
+static const uint64_t MASK51 = 0x7ffffffffffff;
+
+static uint64_t load_7(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+    result |= ((uint64_t)in[6]) << 48;
+
+    return result;
+}
+
+static uint64_t load_6(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+
+    return result;
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+    uint64_t h0 = load_7(s);                                /* 56 bits */
+    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
+    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
+    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
+    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
+
+    h1 |= h0 >> 51; h0 &= MASK51;
+    h2 |= h1 >> 51; h1 &= MASK51;
+    h3 |= h2 >> 51; h2 &= MASK51;
+    h4 |= h3 >> 51; h3 &= MASK51;
+
+    h[0] = h0;
+    h[1] = h1;
+    h[2] = h2;
+    h[3] = h3;
+    h[4] = h4;
+}
+
+static void fe51_tobytes(uint8_t *s, const fe51 h)
+{
+    uint64_t h0 = h[0];
+    uint64_t h1 = h[1];
+    uint64_t h2 = h[2];
+    uint64_t h3 = h[3];
+    uint64_t h4 = h[4];
+    uint64_t q;
 
+    /* compare to modulus */
+    q = (h0 + 19) >> 51;
+    q = (h1 + q) >> 51;
+    q = (h2 + q) >> 51;
+    q = (h3 + q) >> 51;
+    q = (h4 + q) >> 51;
+
+    /* full reduce */
+    h0 += 19 * q;
+    h1 += h0 >> 51; h0 &= MASK51;
+    h2 += h1 >> 51; h1 &= MASK51;
+    h3 += h2 >> 51; h2 &= MASK51;
+    h4 += h3 >> 51; h3 &= MASK51;
+                    h4 &= MASK51;
+
+    /* smash */
+    s[0] = (uint8_t)(h0 >> 0);
+    s[1] = (uint8_t)(h0 >> 8);
+    s[2] = (uint8_t)(h0 >> 16);
+    s[3] = (uint8_t)(h0 >> 24);
+    s[4] = (uint8_t)(h0 >> 32);
+    s[5] = (uint8_t)(h0 >> 40);
+    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
+    s[7] = (uint8_t)(h1 >> 5);
+    s[8] = (uint8_t)(h1 >> 13);
+    s[9] = (uint8_t)(h1 >> 21);
+    s[10] = (uint8_t)(h1 >> 29);
+    s[11] = (uint8_t)(h1 >> 37);
+    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
+    s[13] = (uint8_t)(h2 >> 2);
+    s[14] = (uint8_t)(h2 >> 10);
+    s[15] = (uint8_t)(h2 >> 18);
+    s[16] = (uint8_t)(h2 >> 26);
+    s[17] = (uint8_t)(h2 >> 34);
+    s[18] = (uint8_t)(h2 >> 42);
+    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
+    s[20] = (uint8_t)(h3 >> 7);
+    s[21] = (uint8_t)(h3 >> 15);
+    s[22] = (uint8_t)(h3 >> 23);
+    s[23] = (uint8_t)(h3 >> 31);
+    s[24] = (uint8_t)(h3 >> 39);
+    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
+    s[26] = (uint8_t)(h4 >> 4);
+    s[27] = (uint8_t)(h4 >> 12);
+    s[28] = (uint8_t)(h4 >> 20);
+    s[29] = (uint8_t)(h4 >> 28);
+    s[30] = (uint8_t)(h4 >> 36);
+    s[31] = (uint8_t)(h4 >> 44);
+}
+
+# if defined(X25519_ASM)
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+#  define fe51_mul x25519_fe51_mul
+#  define fe51_sq  x25519_fe51_sqr
+#  define fe51_mul121666 x25519_fe51_mul121666
 # else
 
+typedef __uint128_t u128;
+
 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
     u128 h0, h1, h2, h3, h4;
diff --git a/test/recipes/30-test_evp_data/evppkey.txt b/test/recipes/30-test_evp_data/evppkey.txt
index 7435125..d482c14 100644
--- a/test/recipes/30-test_evp_data/evppkey.txt
+++ b/test/recipes/30-test_evp_data/evppkey.txt
@@ -18436,3 +18436,41 @@ Ctrl = digest:SM3
 Input = D7AD397F6FFA5D4F7F11E7217F241607DC30618C236D2C09C1B9EA8FDADEE2E8
 Output = 3045022100f11bf36e75bb304f094fb42a4ca22377d0cc768637c5011cd59fb9ed4b130c98022035545ffe2c2efb3abee4fee661468946d886004fae8ea5311593e48f7fe21b91
 Result = KEYOP_MISMATCH
+
+Title = Chosen Wycheproof vectors
+
+PrivateKeyRaw = WychePRIVATE0:X25519:288796bc5aff4b81a37501757bc0753a3c21964790d38699308debc17a6eaf8d
+
+PublicKeyRaw = WychePUBLIC0:X25519:f0ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff7f
+
+Derive=WychePRIVATE0
+PeerKey=WychePUBLIC0
+SharedSecret=b4e0dd76da7b071728b61f856771aa356e57eda78a5b1655cc3820fb5f854c5c
+
+PrivateKeyRaw = WychePRIVATE1:X25519:60887b3dc72443026ebedbbbb70665f42b87add1440e7768fbd7e8e2ce5f639d
+
+PublicKeyRaw = WychePUBLIC1:X25519:f0ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+Derive=WychePRIVATE1
+PeerKey=WychePUBLIC1
+SharedSecret=38d6304c4a7e6d9f7959334fb5245bd2c754525d4c91db950206926234c1f633
+
+PrivateKeyRaw = WychePRIVATE2:X25519:a0a4f130b98a5be4b1cedb7cb85584a3520e142d474dc9ccb909a073a976bf63
+
+PublicKeyRaw = WychePUBLIC2:X25519:0ab4e76380d84dde4f6833c58f2a9fb8f83bb0169b172be4b6e0592887741a36
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC2
+SharedSecret=0200000000000000000000000000000000000000000000000000000000000000
+
+PublicKeyRaw = WychePUBLIC3:X25519:89e10d5701b4337d2d032181538b1064bd4084401ceca1fd12663a1959388000
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC3
+SharedSecret=0900000000000000000000000000000000000000000000000000000000000000
+
+PublicKeyRaw = WychePUBLIC4:X25519:2b55d3aa4a8f80c8c0b2ae5f933e85af49beac36c2fa7394bab76c8933f8f81d
+
+Derive=WychePRIVATE2
+PeerKey=WychePUBLIC4
+SharedSecret=1000000000000000000000000000000000000000000000000000000000000000