[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Sat Feb 13 11:31:17 UTC 2016
The branch master has been updated
via 647097e17d932124de895d151458c4205c64bab1 (commit)
via 1fdcef75b06cb17e046bf62cda9ace3e943a5661 (commit)
from 6d9843e7f5899835696d90f859ff88041c851d09 (commit)
- Log -----------------------------------------------------------------
commit 647097e17d932124de895d151458c4205c64bab1
Author: Andy Polyakov <appro at openssl.org>
Date: Wed Feb 10 11:59:45 2016 +0100
Configurations: engage ARM ChaCha20 and Poly1305 modules.
Reviewed-by: Richard Levitte <levitte at openssl.org>
commit 1fdcef75b06cb17e046bf62cda9ace3e943a5661
Author: Andy Polyakov <appro at openssl.org>
Date: Mon Dec 14 18:12:07 2015 +0100
ARM assembly pack: add ChaCha20 and Poly1305 modules.
Reviewed-by: Richard Levitte <levitte at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
Configurations/00-base-templates.conf | 4 +
crypto/chacha/Makefile.in | 3 +
crypto/chacha/asm/chacha-armv4.pl | 1144 +++++++++++++++++++++++++++++++
crypto/chacha/asm/chacha-armv8.pl | 1126 ++++++++++++++++++++++++++++++
crypto/poly1305/Makefile.in | 3 +
crypto/poly1305/asm/poly1305-armv4.pl | 1216 +++++++++++++++++++++++++++++++++
crypto/poly1305/asm/poly1305-armv8.pl | 925 +++++++++++++++++++++++++
7 files changed, 4421 insertions(+)
create mode 100755 crypto/chacha/asm/chacha-armv4.pl
create mode 100755 crypto/chacha/asm/chacha-armv8.pl
create mode 100755 crypto/poly1305/asm/poly1305-armv4.pl
create mode 100755 crypto/poly1305/asm/poly1305-armv8.pl
diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 9d405ef..0f02340 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -153,6 +153,8 @@
aes_asm_src => "aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S",
sha1_asm_src => "sha1-armv4-large.S sha256-armv4.S sha512-armv4.S",
modes_asm_src => "ghash-armv4.S ghashv8-armx.S",
+ chacha_asm_src => "chacha-armv4.S",
+ poly1305_asm_src=> "poly1305-armv4.S",
perlasm_scheme => "void"
},
aarch64_asm => {
@@ -163,6 +165,8 @@
aes_asm_src => "aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S",
sha1_asm_src => "sha1-armv8.S sha256-armv8.S sha512-armv8.S",
modes_asm_src => "ghashv8-armx.S",
+ chacha_asm_src => "chacha-armv8.S",
+ poly1305_asm_src=> "poly1305-armv8.S",
},
parisc11_asm => {
template => 1,
diff --git a/crypto/chacha/Makefile.in b/crypto/chacha/Makefile.in
index 6fb63c1..dd0f36c 100644
--- a/crypto/chacha/Makefile.in
+++ b/crypto/chacha/Makefile.in
@@ -43,6 +43,9 @@ chacha-x86_64.s: asm/chacha-x86_64.pl
chacha-%.S: asm/chacha-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+chacha-armv4.o: chacha-armv4.S
+chacha-armv8.o: chacha-armv8.S
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
new file mode 100755
index 0000000..4d234b7
--- /dev/null
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -0,0 +1,1144 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2014
+#
+# ChaCha20 for ARMv4.
+#
+# Performance in cycles per byte out of large buffer.
+#
+# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
+#
+# Cortex-A5 19.3(*)/+95% 21.8 14.1
+# Cortex-A8 10.5(*)/+160% 13.9 6.35
+# Cortex-A9 12.9(**)/+110% 14.3 6.50
+# Cortex-A15 11.0/+40% 16.0 5.00
+# Snapdragon S4 11.5/+125% 13.6 4.90
+#
+# (*) most "favourable" result for aligned data on a little-endian
+# processor; the result for misaligned data is 10-15% lower;
+# (**) this result is a trade-off: it can be improved by 20%,
+# but then the Snapdragon S4 and Cortex-A8 results get
+# 20-25% worse;
+
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
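
The AUTOLOAD thunk is what lets the ROUND generators below be written as bareword calls such as &add(...) or &mov(...): any call to an undefined sub is caught, the sub name (with "_" turned into ".") becomes the mnemonic, the remaining arguments become the operand list, and a bare numeric last argument gets a "#" immediate prefix. A self-contained sketch of the same mechanism, with illustrative operands that are not taken from the patch:

    #!/usr/bin/env perl
    # Standalone sketch of the AUTOLOAD thunk above (illustrative only).
    $code = "";
    sub AUTOLOAD            # catches any call to an undefined sub
    { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
      my $arg = pop;
      $arg = "#$arg" if ($arg*1 eq $arg);   # bare number -> "#" immediate
      $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
    }
    &add("r0","r0","r4");           # emits "\tadd\tr0,r0,r4\n"
    &eor("r8","r8","r0","ror#16");  # emits "\teor\tr8,r8,r0,ror#16\n"
    &subs("r11","r11",1);           # emits "\tsubs\tr11,r11,#1\n"
    print $code;
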
+
+my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
+my @t=map("r$_",(8..11));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my $odd = $d0&1;
+my ($xc,$xc_) = (@t[0..1]);
+my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
+my @ret;
+
+ # Consider order in which variables are addressed by their
+ # index:
+ #
+ # a b c d
+ #
+ # 0 4 8 12 < even round
+ # 1 5 9 13
+ # 2 6 10 14
+ # 3 7 11 15
+ # 0 5 10 15 < odd round
+ # 1 6 11 12
+ # 2 7 8 13
+ # 3 4 9 14
+ #
+ # 'a' and 'b' are permanently allocated in registers, @x[0..7],
+ # while the 'c's and a pair of 'd's are maintained in memory. If
+ # you observe the 'c' column, you'll notice that the pair of 'c's
+ # is invariant between rounds. This means that we only have to
+ # reload them once per round, in the middle. This is why you'll
+ # see a bunch of 'c' stores and loads in the middle, but none at
+ # the beginning or end. If you observe the 'd' column, you'll
+ # notice that 15 and 13 are reused in the next pair of rounds.
+ # This is why these two are chosen for offloading to memory, so
+ # that each load counts for more.
+ push @ret,(
+ "&add (@x[$a0], at x[$a0], at x[$b0])",
+ "&mov ($xd,$xd,'ror#16')",
+ "&add (@x[$a1], at x[$a1], at x[$b1])",
+ "&mov ($xd_,$xd_,'ror#16')",
+ "&eor ($xd,$xd, at x[$a0],'ror#16')",
+ "&eor ($xd_,$xd_, at x[$a1],'ror#16')",
+
+ "&add ($xc,$xc,$xd)",
+ "&mov (@x[$b0], at x[$b0],'ror#20')",
+ "&add ($xc_,$xc_,$xd_)",
+ "&mov (@x[$b1], at x[$b1],'ror#20')",
+ "&eor (@x[$b0], at x[$b0],$xc,'ror#20')",
+ "&eor (@x[$b1], at x[$b1],$xc_,'ror#20')",
+
+ "&add (@x[$a0], at x[$a0], at x[$b0])",
+ "&mov ($xd,$xd,'ror#24')",
+ "&add (@x[$a1], at x[$a1], at x[$b1])",
+ "&mov ($xd_,$xd_,'ror#24')",
+ "&eor ($xd,$xd, at x[$a0],'ror#24')",
+ "&eor ($xd_,$xd_, at x[$a1],'ror#24')",
+
+ "&add ($xc,$xc,$xd)",
+ "&mov (@x[$b0], at x[$b0],'ror#25')" );
+ push @ret,(
+ "&str ($xd,'[sp,#4*(16+$d0)]')",
+ "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
+ push @ret,(
+ "&add ($xc_,$xc_,$xd_)",
+ "&mov (@x[$b1], at x[$b1],'ror#25')" );
+ push @ret,(
+ "&str ($xd_,'[sp,#4*(16+$d1)]')",
+ "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
+ push @ret,(
+ "&eor (@x[$b0], at x[$b0],$xc,'ror#25')",
+ "&eor (@x[$b1], at x[$b1],$xc_,'ror#25')" );
+
+ $xd=@x[$d2] if (!$odd);
+ $xd_=@x[$d3] if ($odd);
+ push @ret,(
+ "&str ($xc,'[sp,#4*(16+$c0)]')",
+ "&ldr ($xc,'[sp,#4*(16+$c2)]')",
+ "&add (@x[$a2], at x[$a2], at x[$b2])",
+ "&mov ($xd,$xd,'ror#16')",
+ "&str ($xc_,'[sp,#4*(16+$c1)]')",
+ "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
+ "&add (@x[$a3], at x[$a3], at x[$b3])",
+ "&mov ($xd_,$xd_,'ror#16')",
+ "&eor ($xd,$xd, at x[$a2],'ror#16')",
+ "&eor ($xd_,$xd_, at x[$a3],'ror#16')",
+
+ "&add ($xc,$xc,$xd)",
+ "&mov (@x[$b2], at x[$b2],'ror#20')",
+ "&add ($xc_,$xc_,$xd_)",
+ "&mov (@x[$b3], at x[$b3],'ror#20')",
+ "&eor (@x[$b2], at x[$b2],$xc,'ror#20')",
+ "&eor (@x[$b3], at x[$b3],$xc_,'ror#20')",
+
+ "&add (@x[$a2], at x[$a2], at x[$b2])",
+ "&mov ($xd,$xd,'ror#24')",
+ "&add (@x[$a3], at x[$a3], at x[$b3])",
+ "&mov ($xd_,$xd_,'ror#24')",
+ "&eor ($xd,$xd, at x[$a2],'ror#24')",
+ "&eor ($xd_,$xd_, at x[$a3],'ror#24')",
+
+ "&add ($xc,$xc,$xd)",
+ "&mov (@x[$b2], at x[$b2],'ror#25')",
+ "&add ($xc_,$xc_,$xd_)",
+ "&mov (@x[$b3], at x[$b3],'ror#25')",
+ "&eor (@x[$b2], at x[$b2],$xc,'ror#25')",
+ "&eor (@x[$b3], at x[$b3],$xc_,'ror#25')" );
+
+ @ret;
+}
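
For reference, the generated loop performs the standard ChaCha20 double round: &ROUND(0,4,8,12) covers the four column quarter-rounds (two at a time, since only half of the state fits in registers) and &ROUND(0,5,10,15) the four diagonal ones, repeated ten times (the "mov @t[3],#10" below). The quarter-round itself is add/xor/rotate with left-rotations of 16, 12, 8 and 7, which show up in the scalar code as right-rotations by 16, 20, 24 and 25; the .Lsigma words below are the usual "expand 32-byte k" constant. A plain-Perl model of one double round, for orientation only and not part of the module:

    #!/usr/bin/env perl
    # Plain-Perl model of the ChaCha20 double round (sketch, for reference).
    sub rotl { my ($v,$n) = @_; (($v << $n) | ($v >> (32-$n))) & 0xffffffff; }

    sub quarterround {
        my ($x,$a,$b,$c,$d) = @_;          # $x is a ref to the 16-word state
        $$x[$a] = ($$x[$a] + $$x[$b]) & 0xffffffff; $$x[$d] = rotl($$x[$d] ^ $$x[$a], 16);
        $$x[$c] = ($$x[$c] + $$x[$d]) & 0xffffffff; $$x[$b] = rotl($$x[$b] ^ $$x[$c], 12);
        $$x[$a] = ($$x[$a] + $$x[$b]) & 0xffffffff; $$x[$d] = rotl($$x[$d] ^ $$x[$a],  8);
        $$x[$c] = ($$x[$c] + $$x[$d]) & 0xffffffff; $$x[$b] = rotl($$x[$b] ^ $$x[$c],  7);
    }

    sub doubleround {                      # one iteration of the ".Loop" body
        my $x = shift;
        quarterround($x, 0, 4, 8,12);      # even ("column") round
        quarterround($x, 1, 5, 9,13);
        quarterround($x, 2, 6,10,14);
        quarterround($x, 3, 7,11,15);
        quarterround($x, 0, 5,10,15);      # odd ("diagonal") round
        quarterround($x, 1, 6,11,12);
        quarterround($x, 2, 7, 8,13);
        quarterround($x, 3, 4, 9,14);
    }
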
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word -1
+#endif
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r14,pc,#16 @ ChaCha20_ctr32
+#else
+ adr r14,.LChaCha20_ctr32
+#endif
+#if __ARM_MAX_ARCH__>=7
+ cmp r2,#192 @ test len
+ bls .Lshort
+ ldr r4,[r14,#-32]
+ ldr r4,[r14,r4]
+# ifdef __APPLE__
+ ldr r4,[r4]
+# endif
+ tst r4,#1
+ bne .LChaCha20_neon
+.Lshort:
+#endif
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ sub r14,r14,#64 @ .Lsigma
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
+ str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str @t[3],[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr @t[3], [sp,#4*(15)]
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ ldr @t[2], [sp,#4*(13)]
+ ldr @x[14],[sp,#4*(14)]
+ str @t[3], [sp,#4*(16+15)]
+ mov @t[3],#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs @t[3], at t[3],#1
+___
+ foreach (&ROUND(0, 4, 8,12)) { eval; }
+ foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+ bne .Loop
+
+ ldr @t[3],[sp,#4*(32+2)] @ load len
+
+ str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
+ str @t[1], [sp,#4*(16+9)]
+ str @x[12],[sp,#4*(16+12)]
+ str @t[2], [sp,#4*(16+13)]
+ str @x[14],[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ @x[0-7] and second half at sp+4*(16+8)
+
+ cmp @t[3],#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr @t[0],[sp,#4*(0)] @ load key material
+ ldr @t[1],[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+ orr @t[2],r12,r14
+ tst @t[2],#3 @ are input and output aligned?
+ ldr @t[2],[sp,#4*(2)]
+ bne .Lunaligned
+ cmp @t[3],#64 @ restore flags
+# else
+ ldr @t[2],[sp,#4*(2)]
+# endif
+ ldr @t[3],[sp,#4*(3)]
+
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ add @x[1], at x[1], at t[1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+
+ add @x[2], at x[2], at t[2]
+ add @x[3], at x[3], at t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[0], at x[0], at t[0] @ xor with input
+ eorhs @x[1], at x[1], at t[1]
+ add @t[0],sp,#4*(4)
+ str @x[0],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[2], at x[2], at t[2]
+ eorhs @x[3], at x[3], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[1],[r14,#-12]
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ add @x[5], at x[5], at t[1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+ add @x[6], at x[6], at t[2]
+ add @x[7], at x[7], at t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[4], at x[4], at t[0]
+ eorhs @x[5], at x[5], at t[1]
+ add @t[0],sp,#4*(8)
+ str @x[4],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[6], at x[6], at t[2]
+ eorhs @x[7], at x[7], at t[3]
+ str @x[5],[r14,#-12]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[6],[r14,#-8]
+ add @x[0],sp,#4*(16+8)
+ str @x[7],[r14,#-4]
+
+ ldmia @x[0],{@x[0]- at x[7]} @ load second half
+
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ add @x[1], at x[1], at t[1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
+ add @x[2], at x[2], at t[2]
+ add @x[3], at x[3], at t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[0], at x[0], at t[0]
+ eorhs @x[1], at x[1], at t[1]
+ add @t[0],sp,#4*(12)
+ str @x[0],[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[2], at x[2], at t[2]
+ eorhs @x[3], at x[3], at t[3]
+ str @x[1],[r14,#-12]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ add @x[5], at x[5], at t[1]
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi @t[0], at t[0],#1 @ next counter value
+ strhi @t[0],[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[0],[r12],#16 @ load input
+ ldrhs @t[1],[r12,#-12]
+ add @x[6], at x[6], at t[2]
+ add @x[7], at x[7], at t[3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs @t[2],[r12,#-8]
+ ldrhs @t[3],[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[4], at x[4], at t[0]
+ eorhs @x[5], at x[5], at t[1]
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs @x[6], at x[6], at t[2]
+ eorhs @x[7], at x[7], at t[3]
+ str @x[4],[r14],#16 @ store output
+ str @x[5],[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs @t[3], at t[0],#64 @ len-=64
+ str @x[6],[r14,#-8]
+ str @x[7],[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+# if __ARM_ARCH__<7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp @t[3],#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+ ldr @t[3],[sp,#4*(3)]
+___
+for ($i=0;$i<16;$i+=4) {
+my $j=$i&0x7;
+
+$code.=<<___ if ($i==4);
+ add @x[0],sp,#4*(16+8)
+___
+$code.=<<___ if ($i==8);
+ ldmia @x[0],{@x[0]- at x[7]} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
+___
+$code.=<<___;
+ add @x[$j+0], at x[$j+0], at t[0] @ accumulate key material
+___
+$code.=<<___ if ($i==12);
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi @t[0], at t[0],#1 @ next counter value
+ strhi @t[0],[sp,#4*(12)] @ save next counter value
+___
+$code.=<<___;
+ add @x[$j+1], at x[$j+1], at t[1]
+ add @x[$j+2], at x[$j+2], at t[2]
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo @t[0], at t[0], at t[0] @ zero or ...
+ ldrhsb @t[0],[r12],#16 @ ... load input
+ eorlo @t[1], at t[1], at t[1]
+ ldrhsb @t[1],[r12,#-12]
+
+ add @x[$j+3], at x[$j+3], at t[3]
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo @t[2], at t[2], at t[2]
+ ldrhsb @t[2],[r12,#-8]
+ eorlo @t[3], at t[3], at t[3]
+ ldrhsb @t[3],[r12,#-4]
+
+ eor @x[$j+0], at t[0], at x[$j+0] @ xor with input (or zero)
+ eor @x[$j+1], at t[1], at x[$j+1]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-15] @ load more input
+ ldrhsb @t[1],[r12,#-11]
+ eor @x[$j+2], at t[2], at x[$j+2]
+ strb @x[$j+0],[r14],#16 @ store output
+ eor @x[$j+3], at t[3], at x[$j+3]
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-7]
+ ldrhsb @t[3],[r12,#-3]
+ strb @x[$j+1],[r14,#-12]
+ eor @x[$j+0], at t[0], at x[$j+0],lsr#8
+ strb @x[$j+2],[r14,#-8]
+ eor @x[$j+1], at t[1], at x[$j+1],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-14] @ load more input
+ ldrhsb @t[1],[r12,#-10]
+ strb @x[$j+3],[r14,#-4]
+ eor @x[$j+2], at t[2], at x[$j+2],lsr#8
+ strb @x[$j+0],[r14,#-15]
+ eor @x[$j+3], at t[3], at x[$j+3],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-6]
+ ldrhsb @t[3],[r12,#-2]
+ strb @x[$j+1],[r14,#-11]
+ eor @x[$j+0], at t[0], at x[$j+0],lsr#8
+ strb @x[$j+2],[r14,#-7]
+ eor @x[$j+1], at t[1], at x[$j+1],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[0],[r12,#-13] @ load more input
+ ldrhsb @t[1],[r12,#-9]
+ strb @x[$j+3],[r14,#-3]
+ eor @x[$j+2], at t[2], at x[$j+2],lsr#8
+ strb @x[$j+0],[r14,#-14]
+ eor @x[$j+3], at t[3], at x[$j+3],lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb @t[2],[r12,#-5]
+ ldrhsb @t[3],[r12,#-1]
+ strb @x[$j+1],[r14,#-10]
+ strb @x[$j+2],[r14,#-6]
+ eor @x[$j+0], at t[0], at x[$j+0],lsr#8
+ strb @x[$j+3],[r14,#-2]
+ eor @x[$j+1], at t[1], at x[$j+1],lsr#8
+ strb @x[$j+0],[r14,#-13]
+ eor @x[$j+2], at t[2], at x[$j+2],lsr#8
+ strb @x[$j+1],[r14,#-9]
+ eor @x[$j+3], at t[3], at x[$j+3],lsr#8
+ strb @x[$j+2],[r14,#-5]
+ strb @x[$j+3],[r14,#-1]
+___
+$code.=<<___ if ($i<12);
+ add @t[0],sp,#4*(4+$i)
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+___
+}
+$code.=<<___;
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs @t[3], at t[0],#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add @t[2],sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb @t[0],[@t[2]],#1 @ read buffer on stack
+ ldrb @t[1],[r12],#1 @ read input
+ subs @t[3], at t[3],#1
+ eor @t[0], at t[0], at t[1]
+ strb @t[0],[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+ ldmia sp!,{r4-r11,pc}
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
+ map("q$_",(0..15));
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+ (
+ "&vadd_i32 ($a,$a,$b)",
+ "&veor ($d,$d,$a)",
+ "&vrev32_16 ($d,$d)", # vrot ($d,16)
+
+ "&vadd_i32 ($c,$c,$d)",
+ "&veor ($t,$b,$c)",
+ "&vshr_u32 ($b,$t,20)",
+ "&vsli_32 ($b,$t,12)",
+
+ "&vadd_i32 ($a,$a,$b)",
+ "&veor ($t,$d,$a)",
+ "&vshr_u32 ($d,$t,24)",
+ "&vsli_32 ($d,$t,8)",
+
+ "&vadd_i32 ($c,$c,$d)",
+ "&veor ($t,$b,$c)",
+ "&vshr_u32 ($b,$t,25)",
+ "&vsli_32 ($b,$t,7)",
+
+ "&vext_8 ($c,$c,$c,8)",
+ "&vext_8 ($b,$b,$b,$odd?12:4)",
+ "&vext_8 ($d,$d,$d,$odd?4:12)"
+ );
+}
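
NEON has no vector rotate instruction, so NEONROUND() synthesizes each rotation: the 16-bit case is a half-word swap (vrev32.16), and the others use a vshr.u32/vsli.32 pair, shifting right by 32-n into the destination and then shift-left-inserting by n. A scalar Perl model of that identity (a sketch, not taken from the module):

    #!/usr/bin/env perl
    # The vshr.u32/vsli.32 pair used in NEONROUND() is a rotate-left:
    #   b  = t >> (32-n)      (vshr.u32 b,t,#(32-n))
    #   b |= t << n           (vsli.32  b,t,#n keeps the low n bits already in b)
    # Scalar model of that identity (sketch only):
    sub rotl_via_shr_sli {
        my ($t,$n) = @_;
        my $b = ($t >> (32-$n)) & 0xffffffff;    # vshr.u32
        $b |= ($t << $n) & 0xffffffff;           # vsli.32
        return $b;
    }
    printf "%08x\n", rotl_via_shr_sli(0x00000001, 12);   # 00001000
    printf "%08x\n", rotl_via_shr_sli(0x80000000, 12);   # 00000800
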
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+.LChaCha20_neon:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {$b0-$c0},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {$d0},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {$a0},[r14]! @ load sigma
+ vld1.32 {$t0},[r14] @ one
+ vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
+ str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
+ vshl.i32 $t1#lo,$t0#lo,#1 @ two
+ vstr $t0#lo,[sp,#4*(16+0)]
+ vshl.i32 $t2#lo,$t0#lo,#2 @ four
+ vstr $t1#lo,[sp,#4*(16+2)]
+ vmov $a1,$a0
+ vstr $t2#lo,[sp,#4*(16+4)]
+ vmov $a2,$a0
+ vmov $b1,$b0
+ vmov $b2,$b0
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp @t[3],#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov $a1,$a0
+ str @t[3],[sp,#4*(32+2)] @ save len
+ vmov $a2,$a0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov $b1,$b0
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov $b2,$b0
+.Loop_neon_enter:
+ ldr @t[3], [sp,#4*(15)]
+ vadd.i32 $d1,$d0,$t0 @ counter+1
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ vmov $c1,$c0
+ ldr @t[2], [sp,#4*(13)]
+ vmov $c2,$c0
+ ldr @x[14],[sp,#4*(14)]
+ vadd.i32 $d2,$d1,$t0 @ counter+2
+ str @t[3], [sp,#4*(16+15)]
+ mov @t[3],#10
+ add @x[12], at x[12],#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs @t[3], at t[3],#1
+___
+ my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
+ my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
+ my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
+ my @thread3=&ROUND(0,4,8,12);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+
+ @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
+ @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
+ @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
+ @thread3=&ROUND(0,5,10,15);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
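
The two foreach/eval blocks above are where the "3xNEON+1xIALU" figure in the header comes from: three NEON instruction streams (one 64-byte block each) are merged round-robin with the scalar ROUND stream, roughly three scalar instructions per NEON instruction, so the integer pipeline works on a fourth block while NEON handles three. A toy model of that merge (illustrative only, the instruction strings are made up):

    #!/usr/bin/env perl
    # Shape of the instruction scheduling above: three NEON streams and one
    # scalar stream merged round-robin.  (Illustrative sketch.)
    sub emit { my $i = shift; print "$i\n" if defined $i; }
    my @thread0 = map { "neon0 insn $_" } 1..4;
    my @thread1 = map { "neon1 insn $_" } 1..4;
    my @thread2 = map { "neon2 insn $_" } 1..4;
    my @thread3 = map { "scalar insn $_" } 1..12;   # ~3x as many scalar insns
    foreach (@thread0) {
        emit($_);              emit(shift @thread3);
        emit(shift @thread1);  emit(shift @thread3);
        emit(shift @thread2);  emit(shift @thread3);
    }
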
+$code.=<<___;
+ bne .Loop_neon
+
+ add @t[3],sp,#32
+ vld1.32 {$t0-$t1},[sp] @ load key material
+ vld1.32 {$t2-$t3},[@t[3]]
+
+ ldr @t[3],[sp,#4*(32+2)] @ load len
+
+ str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
+ str @t[1], [sp,#4*(16+9)]
+ str @x[12],[sp,#4*(16+12)]
+ str @t[2], [sp,#4*(16+13)]
+ str @x[14],[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ @x[0-7] and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 $a0,$a0,$t0 @ accumulate key material
+ vadd.i32 $a1,$a1,$t0
+ vadd.i32 $a2,$a2,$t0
+ vldr $t0#lo,[sp,#4*(16+0)] @ one
+
+ vadd.i32 $b0,$b0,$t1
+ vadd.i32 $b1,$b1,$t1
+ vadd.i32 $b2,$b2,$t1
+ vldr $t1#lo,[sp,#4*(16+2)] @ two
+
+ vadd.i32 $c0,$c0,$t2
+ vadd.i32 $c1,$c1,$t2
+ vadd.i32 $c2,$c2,$t2
+ vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
+ vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
+
+ vadd.i32 $d0,$d0,$t3
+ vadd.i32 $d1,$d1,$t3
+ vadd.i32 $d2,$d2,$t3
+
+ cmp @t[3],#64*4
+ blo .Ltail_neon
+
+ vld1.8 {$t0-$t1},[r12]! @ load input
+ mov @t[3],sp
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0 @ xor with input
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ vst1.8 {$a0-$b0},[r14]! @ store output
+ veor $b1,$b1,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c1,$c1,$t2
+ vst1.8 {$c0-$d0},[r14]!
+ veor $d1,$d1,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a2,$a2,$t0
+ vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
+ veor $t0#hi,$t0#hi,$t0#hi
+ vldr $t0#lo,[sp,#4*(16+4)] @ four
+ veor $b2,$b2,$t1
+ vld1.32 {$c0-$d0},[@t[3]]
+ veor $c2,$c2,$t2
+ vst1.8 {$a1-$b1},[r14]!
+ veor $d2,$d2,$t3
+ vst1.8 {$c1-$d1},[r14]!
+
+ vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
+ vldr $t0#lo,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{@t[0]- at t[3]} @ load key material
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ vst1.8 {$a2-$b2},[r14]!
+ add @x[1], at x[1], at t[1]
+ ldr @t[1],[r12,#-12]
+ vst1.8 {$c2-$d2},[r14]!
+ add @x[2], at x[2], at t[2]
+ ldr @t[2],[r12,#-8]
+ add @x[3], at x[3], at t[3]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+# endif
+ eor @x[0], at x[0], at t[0] @ xor with input
+ add @t[0],sp,#4*(4)
+ eor @x[1], at x[1], at t[1]
+ str @x[0],[r14],#16 @ store output
+ eor @x[2], at x[2], at t[2]
+ str @x[1],[r14,#-12]
+ eor @x[3], at x[3], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ add @x[5], at x[5], at t[1]
+ ldr @t[1],[r12,#-12]
+ add @x[6], at x[6], at t[2]
+ ldr @t[2],[r12,#-8]
+ add @x[7], at x[7], at t[3]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+ eor @x[4], at x[4], at t[0]
+ add @t[0],sp,#4*(8)
+ eor @x[5], at x[5], at t[1]
+ str @x[4],[r14],#16 @ store output
+ eor @x[6], at x[6], at t[2]
+ str @x[5],[r14,#-12]
+ eor @x[7], at x[7], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[6],[r14,#-8]
+ add @x[0],sp,#4*(16+8)
+ str @x[7],[r14,#-4]
+
+ ldmia @x[0],{@x[0]- at x[7]} @ load second half
+
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ ldr @t[0],[r12],#16 @ load input
+ add @x[1], at x[1], at t[1]
+ ldr @t[1],[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
+ add @x[2], at x[2], at t[2]
+ ldr @t[2],[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
+ add @x[3], at x[3], at t[3]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+# endif
+ eor @x[0], at x[0], at t[0]
+ add @t[0],sp,#4*(12)
+ eor @x[1], at x[1], at t[1]
+ str @x[0],[r14],#16 @ store output
+ eor @x[2], at x[2], at t[2]
+ str @x[1],[r14,#-12]
+ eor @x[3], at x[3], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+ str @x[2],[r14,#-8]
+ str @x[3],[r14,#-4]
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ add @t[0], at t[0],#4 @ next counter value
+ add @x[5], at x[5], at t[1]
+ str @t[0],[sp,#4*(12)] @ save next counter value
+ ldr @t[0],[r12],#16 @ load input
+ add @x[6], at x[6], at t[2]
+ add @x[4], at x[4],#3 @ counter+3
+ ldr @t[1],[r12,#-12]
+ add @x[7], at x[7], at t[3]
+ ldr @t[2],[r12,#-8]
+ ldr @t[3],[r12,#-4]
+# ifdef __ARMEB__
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+ eor @x[4], at x[4], at t[0]
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+ eor @x[5], at x[5], at t[1]
+ eor @x[6], at x[6], at t[2]
+ str @x[4],[r14],#16 @ store output
+ eor @x[7], at x[7], at t[3]
+ str @x[5],[r14,#-12]
+ sub @t[3], at t[0],#64*4 @ len-=64*4
+ str @x[6],[r14,#-8]
+ str @x[7],[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
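+ @ (cross-check: the NEON prologue allocates 4*(16+16) bytes of scratch
+ @ plus 4*4 for the {r0-r3} save plus 4*16 for {d8-d15}, i.e. 4*(52)
+ @ below the common {r0-r2,r4-r11,lr} save, whereas the integer-only
+ @ prologue allocates 4*(32); the difference is the 4*(20) used here.)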
+
+ str @t[3], [sp,#4*(20+32+2)] @ save len
+ add @t[3],sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr @x[12],[sp,#4*(16+10)]
+ ldr @x[14],[sp,#4*(16+11)]
+ vldmia @t[3],{d8-d15} @ fulfill ABI requirement
+ str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
+ str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
+
+ ldr @t[3], [sp,#4*(15)]
+ ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
+ ldr @t[2], [sp,#4*(13)]
+ ldr @x[14],[sp,#4*(14)]
+ str @t[3], [sp,#4*(20+16+15)]
+ add @t[3],sp,#4*(20)
+ vst1.32 {$a0-$b0},[@t[3]]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {$c0-$d0},[@t[3]]
+ mov @t[3],#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp @t[3],#64*3
+ bhs .L192_or_more_neon
+ cmp @t[3],#64*2
+ bhs .L128_or_more_neon
+ cmp @t[3],#64*1
+ bhs .L64_or_more_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a0-$b0},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c0-$d0},[@t[0]]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vst1.8 {$a0-$b0},[r14]!
+ vst1.8 {$c0-$d0},[r14]!
+
+ beq .Ldone_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a1-$b1},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c1-$d1},[@t[0]]
+ sub @t[3], at t[3],#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ veor $b1,$b1,$t1
+ vst1.8 {$a0-$b0},[r14]!
+ veor $c1,$c1,$t2
+ vst1.8 {$c0-$d0},[r14]!
+ veor $d1,$d1,$t3
+ vst1.8 {$a1-$b1},[r14]!
+ vst1.8 {$c1-$d1},[r14]!
+
+ beq .Ldone_neon
+
+ add @t[0],sp,#4*(8)
+ vst1.8 {$a2-$b2},[sp]
+ add @t[2],sp,#4*(0)
+ vst1.8 {$c2-$d2},[@t[0]]
+ sub @t[3], at t[3],#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {$t0-$t1},[r12]!
+ vld1.8 {$t2-$t3},[r12]!
+ veor $a0,$a0,$t0
+ veor $b0,$b0,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c0,$c0,$t2
+ veor $d0,$d0,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a1,$a1,$t0
+ veor $b1,$b1,$t1
+ vld1.8 {$t0-$t1},[r12]!
+ veor $c1,$c1,$t2
+ vst1.8 {$a0-$b0},[r14]!
+ veor $d1,$d1,$t3
+ vld1.8 {$t2-$t3},[r12]!
+
+ veor $a2,$a2,$t0
+ vst1.8 {$c0-$d0},[r14]!
+ veor $b2,$b2,$t1
+ vst1.8 {$a1-$b1},[r14]!
+ veor $c2,$c2,$t2
+ vst1.8 {$c1-$d1},[r14]!
+ veor $d2,$d2,$t3
+ vst1.8 {$a2-$b2},[r14]!
+ vst1.8 {$c2-$d2},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{@t[0]- at t[3]} @ load key material
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ add @t[0],sp,#4*(4)
+ add @x[1], at x[1], at t[1]
+ add @x[2], at x[2], at t[2]
+ add @x[3], at x[3], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ add @t[0],sp,#4*(8)
+ add @x[5], at x[5], at t[1]
+ add @x[6], at x[6], at t[2]
+ add @x[7], at x[7], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+# ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+ stmia sp,{@x[0]- at x[7]}
+ add @x[0],sp,#4*(16+8)
+
+ ldmia @x[0],{@x[0]- at x[7]} @ load second half
+
+ add @x[0], at x[0], at t[0] @ accumulate key material
+ add @t[0],sp,#4*(12)
+ add @x[1], at x[1], at t[1]
+ add @x[2], at x[2], at t[2]
+ add @x[3], at x[3], at t[3]
+ ldmia @t[0],{@t[0]- at t[3]} @ load key material
+
+ add @x[4], at x[4], at t[0] @ accumulate key material
+ add @t[0],sp,#4*(8)
+ add @x[5], at x[5], at t[1]
+ add @x[4], at x[4],#3 @ counter+3
+ add @x[6], at x[6], at t[2]
+ add @x[7], at x[7], at t[3]
+ ldr @t[3],[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[1], at x[1]
+ rev @x[2], at x[2]
+ rev @x[3], at x[3]
+ rev @x[4], at x[4]
+ rev @x[5], at x[5]
+ rev @x[6], at x[6]
+ rev @x[7], at x[7]
+# endif
+ stmia @t[0],{@x[0]- at x[7]}
+ add @t[2],sp,#4*(0)
+ sub @t[3], at t[0],#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb @t[0],[@t[2]],#1 @ read buffer on stack
+ ldrb @t[1],[r12],#1 @ read input
+ subs @t[3], at t[3],#1
+ eor @t[0], at t[0], at t[1]
+ strb @t[0],[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4-r11,pc}
+.size ChaCha20_neon,.-ChaCha20_neon
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+}}}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
new file mode 100755
index 0000000..6ddb31f
--- /dev/null
+++ b/crypto/chacha/asm/chacha-armv8.pl
@@ -0,0 +1,1126 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2015
+#
+# ChaCha20 for ARMv8.
+#
+# Performance in cycles per byte out of large buffer.
+#
+# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
+#
+# Apple A7 5.50/+49% 3.33 1.70
+# Cortex-A53 8.40/+80% 4.72 4.72(*)
+# Cortex-A57 8.06/+43% 4.90 4.43(**)
+# Denver 4.50/+82% 2.63 2.67(*)
+# X-Gene 9.50/+46% 8.82 8.89(*)
+#
+# (*) it's expected that doubling the interleave factor doesn't help
+# all processors, only those with higher NEON latency and
+# higher instruction issue rate;
+# (**) the expected improvement was actually higher;
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
+
+my @x=map("x$_",(5..17,19..21));
+my @d=map("x$_",(22..28,30));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+ (
+ "&add_32 (@x[$a0], at x[$a0], at x[$b0])",
+ "&add_32 (@x[$a1], at x[$a1], at x[$b1])",
+ "&add_32 (@x[$a2], at x[$a2], at x[$b2])",
+ "&add_32 (@x[$a3], at x[$a3], at x[$b3])",
+ "&eor_32 (@x[$d0], at x[$d0], at x[$a0])",
+ "&eor_32 (@x[$d1], at x[$d1], at x[$a1])",
+ "&eor_32 (@x[$d2], at x[$d2], at x[$a2])",
+ "&eor_32 (@x[$d3], at x[$d3], at x[$a3])",
+ "&ror_32 (@x[$d0], at x[$d0],16)",
+ "&ror_32 (@x[$d1], at x[$d1],16)",
+ "&ror_32 (@x[$d2], at x[$d2],16)",
+ "&ror_32 (@x[$d3], at x[$d3],16)",
+
+ "&add_32 (@x[$c0], at x[$c0], at x[$d0])",
+ "&add_32 (@x[$c1], at x[$c1], at x[$d1])",
+ "&add_32 (@x[$c2], at x[$c2], at x[$d2])",
+ "&add_32 (@x[$c3], at x[$c3], at x[$d3])",
+ "&eor_32 (@x[$b0], at x[$b0], at x[$c0])",
+ "&eor_32 (@x[$b1], at x[$b1], at x[$c1])",
+ "&eor_32 (@x[$b2], at x[$b2], at x[$c2])",
+ "&eor_32 (@x[$b3], at x[$b3], at x[$c3])",
+ "&ror_32 (@x[$b0], at x[$b0],20)",
+ "&ror_32 (@x[$b1], at x[$b1],20)",
+ "&ror_32 (@x[$b2], at x[$b2],20)",
+ "&ror_32 (@x[$b3], at x[$b3],20)",
+
+ "&add_32 (@x[$a0], at x[$a0], at x[$b0])",
+ "&add_32 (@x[$a1], at x[$a1], at x[$b1])",
+ "&add_32 (@x[$a2], at x[$a2], at x[$b2])",
+ "&add_32 (@x[$a3], at x[$a3], at x[$b3])",
+ "&eor_32 (@x[$d0], at x[$d0], at x[$a0])",
+ "&eor_32 (@x[$d1], at x[$d1], at x[$a1])",
+ "&eor_32 (@x[$d2], at x[$d2], at x[$a2])",
+ "&eor_32 (@x[$d3], at x[$d3], at x[$a3])",
+ "&ror_32 (@x[$d0], at x[$d0],24)",
+ "&ror_32 (@x[$d1], at x[$d1],24)",
+ "&ror_32 (@x[$d2], at x[$d2],24)",
+ "&ror_32 (@x[$d3], at x[$d3],24)",
+
+ "&add_32 (@x[$c0], at x[$c0], at x[$d0])",
+ "&add_32 (@x[$c1], at x[$c1], at x[$d1])",
+ "&add_32 (@x[$c2], at x[$c2], at x[$d2])",
+ "&add_32 (@x[$c3], at x[$c3], at x[$d3])",
+ "&eor_32 (@x[$b0], at x[$b0], at x[$c0])",
+ "&eor_32 (@x[$b1], at x[$b1], at x[$c1])",
+ "&eor_32 (@x[$b2], at x[$b2], at x[$c2])",
+ "&eor_32 (@x[$b3], at x[$b3], at x[$c3])",
+ "&ror_32 (@x[$b0], at x[$b0],25)",
+ "&ror_32 (@x[$b1], at x[$b1],25)",
+ "&ror_32 (@x[$b2], at x[$b2],25)",
+ "&ror_32 (@x[$b3], at x[$b3],25)"
+ );
+}
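
On ARMv8 the whole 512-bit state fits in general-purpose registers (@x[0..15]), so ROUND() emits all four quarter-rounds of a column or diagonal pass fully interleaved instead of two at a time. The index arithmetic that derives the remaining column/diagonal sets from the first one is the same map() expression used in the 32-bit module; a standalone sketch of what it produces:

    #!/usr/bin/env perl
    # How ROUND() derives the other three column/diagonal index sets from the
    # first one: ($_ & ~3) keeps the a/b/c/d band, (($_+1) & 3) steps to the
    # next index inside it.  (Standalone sketch, mirrors the map() above.)
    sub next_set { map { ($_ & ~3) + (($_ + 1) & 3) } @_; }

    my @set = (0,4,8,12);                  # &ROUND(0,4,8,12): column pass
    for (1..3) {
        @set = next_set(@set);
        print "@set\n";                    # 1 5 9 13 / 2 6 10 14 / 3 7 11 15
    }
    @set = (0,5,10,15);                    # &ROUND(0,5,10,15): diagonal pass
    for (1..3) {
        @set = next_set(@set);
        print "@set\n";                    # 1 6 11 12 / 2 7 8 13 / 3 4 9 14
    }
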
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern OPENSSL_armcap_P
+
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+ cbz $len,.Labort
+ adr @x[0],.LOPENSSL_armcap_P
+ cmp $len,#192
+ b.lo .Lshort
+#ifdef __ILP32__
+ ldrsw @x[1],[@x[0]]
+#else
+ ldr @x[1],[@x[0]]
+#endif
+ ldr w17,[@x[1], at x[0]]
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon
+
+.Lshort:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp @d[0], at d[1],[@x[0]] // load sigma
+ ldp @d[2], at d[3],[$key] // load key
+ ldp @d[4], at d[5],[$key,#16]
+ ldp @d[6], at d[7],[$ctr] // load counter
+#ifdef __ARMEB__
+ ror @d[2], at d[2],#32
+ ror @d[3], at d[3],#32
+ ror @d[4], at d[4],#32
+ ror @d[5], at d[5],#32
+ ror @d[6], at d[6],#32
+ ror @d[7], at d[7],#32
+#endif
+
+.Loop_outer:
+ mov.32 @x[0], at d[0] // unpack key block
+ lsr @x[1], at d[0],#32
+ mov.32 @x[2], at d[1]
+ lsr @x[3], at d[1],#32
+ mov.32 @x[4], at d[2]
+ lsr @x[5], at d[2],#32
+ mov.32 @x[6], at d[3]
+ lsr @x[7], at d[3],#32
+ mov.32 @x[8], at d[4]
+ lsr @x[9], at d[4],#32
+ mov.32 @x[10], at d[5]
+ lsr @x[11], at d[5],#32
+ mov.32 @x[12], at d[6]
+ lsr @x[13], at d[6],#32
+ mov.32 @x[14], at d[7]
+ lsr @x[15], at d[7],#32
+
+ mov $ctr,#10
+ subs $len,$len,#64
+.Loop:
+ sub $ctr,$ctr,#1
+___
+ foreach (&ROUND(0, 4, 8,12)) { eval; }
+ foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+ cbnz $ctr,.Loop
+
+ add.32 @x[0], at x[0], at d[0] // accumulate key block
+ add @x[1], at x[1], at d[0],lsr#32
+ add.32 @x[2], at x[2], at d[1]
+ add @x[3], at x[3], at d[1],lsr#32
+ add.32 @x[4], at x[4], at d[2]
+ add @x[5], at x[5], at d[2],lsr#32
+ add.32 @x[6], at x[6], at d[3]
+ add @x[7], at x[7], at d[3],lsr#32
+ add.32 @x[8], at x[8], at d[4]
+ add @x[9], at x[9], at d[4],lsr#32
+ add.32 @x[10], at x[10], at d[5]
+ add @x[11], at x[11], at d[5],lsr#32
+ add.32 @x[12], at x[12], at d[6]
+ add @x[13], at x[13], at d[6],lsr#32
+ add.32 @x[14], at x[14], at d[7]
+ add @x[15], at x[15], at d[7],lsr#32
+
+ b.lo .Ltail
+
+ add @x[0], at x[0], at x[1],lsl#32 // pack
+ add @x[2], at x[2], at x[3],lsl#32
+ ldp @x[1], at x[3],[$inp,#0] // load input
+ add @x[4], at x[4], at x[5],lsl#32
+ add @x[6], at x[6], at x[7],lsl#32
+ ldp @x[5], at x[7],[$inp,#16]
+ add @x[8], at x[8], at x[9],lsl#32
+ add @x[10], at x[10], at x[11],lsl#32
+ ldp @x[9], at x[11],[$inp,#32]
+ add @x[12], at x[12], at x[13],lsl#32
+ add @x[14], at x[14], at x[15],lsl#32
+ ldp @x[13], at x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[2], at x[2]
+ rev @x[4], at x[4]
+ rev @x[6], at x[6]
+ rev @x[8], at x[8]
+ rev @x[10], at x[10]
+ rev @x[12], at x[12]
+ rev @x[14], at x[14]
+#endif
+ eor @x[0], at x[0], at x[1]
+ eor @x[2], at x[2], at x[3]
+ eor @x[4], at x[4], at x[5]
+ eor @x[6], at x[6], at x[7]
+ eor @x[8], at x[8], at x[9]
+ eor @x[10], at x[10], at x[11]
+ eor @x[12], at x[12], at x[13]
+ eor @x[14], at x[14], at x[15]
+
+ stp @x[0], at x[2],[$out,#0] // store output
+ add @d[6], at d[6],#1 // increment counter
+ stp @x[4], at x[6],[$out,#16]
+ stp @x[8], at x[10],[$out,#32]
+ stp @x[12], at x[14],[$out,#48]
+ add $out,$out,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add $len,$len,#64
+.Less_than_64:
+ sub $out,$out,#1
+ add $inp,$inp,$len
+ add $out,$out,$len
+ add $ctr,sp,$len
+ neg $len,$len
+
+ add @x[0], at x[0], at x[1],lsl#32 // pack
+ add @x[2], at x[2], at x[3],lsl#32
+ add @x[4], at x[4], at x[5],lsl#32
+ add @x[6], at x[6], at x[7],lsl#32
+ add @x[8], at x[8], at x[9],lsl#32
+ add @x[10], at x[10], at x[11],lsl#32
+ add @x[12], at x[12], at x[13],lsl#32
+ add @x[14], at x[14], at x[15],lsl#32
+#ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[2], at x[2]
+ rev @x[4], at x[4]
+ rev @x[6], at x[6]
+ rev @x[8], at x[8]
+ rev @x[10], at x[10]
+ rev @x[12], at x[12]
+ rev @x[14], at x[14]
+#endif
+ stp @x[0], at x[2],[sp,#0]
+ stp @x[4], at x[6],[sp,#16]
+ stp @x[8], at x[10],[sp,#32]
+ stp @x[12], at x[14],[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[$inp,$len]
+ ldrb w11,[$ctr,$len]
+ add $len,$len,#1
+ eor w10,w10,w11
+ strb w10,[$out,$len]
+ cbnz $len,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+___
+
+{{{
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
+ map("v$_.4s",(0..7,16..23));
+my (@K)=map("v$_.4s",(24..30));
+my $ONE="v31.4s";
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+ (
+ "&add ('$a','$a','$b')",
+ "&eor ('$d','$d','$a')",
+ "&rev32_16 ('$d','$d')", # vrot ($d,16)
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',20)",
+ "&sli ('$b','$t',12)",
+
+ "&add ('$a','$a','$b')",
+ "&eor ('$t','$d','$a')",
+ "&ushr ('$d','$t',24)",
+ "&sli ('$d','$t',8)",
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',25)",
+ "&sli ('$b','$t',7)",
+
+ "&ext ('$c','$c','$c',8)",
+ "&ext ('$d','$d','$d',$odd?4:12)",
+ "&ext ('$b','$b','$b',$odd?12:4)"
+ );
+}
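
The trailing ext instructions in NEONROUND() are what move the vectorized code between the column and the diagonal pass: rotating the 'b', 'c' and 'd' rows by one, two and three 32-bit lanes (ext by 4, 8 and 12 bytes) lines the diagonals up in matching lanes, and the $odd flag selects the inverse rotation that puts them back. A small Perl model of that lane rotation (a sketch; the lane values are illustrative):

    #!/usr/bin/env perl
    # Model of the ext-based lane rotation that switches NEONROUND() between
    # column and diagonal passes.  "ext v,v,v,#4*n" makes lane i take the
    # value of lane (i+n) mod 4.  (Sketch only.)
    sub ext_lanes {
        my ($n, @row) = @_;
        return @row[$n..3, 0..$n-1];
    }
    my @b = qw(x4 x5 x6 x7);      # the "b" row after a column round
    my @c = qw(x8 x9 x10 x11);
    my @d = qw(x12 x13 x14 x15);
    print "@{[ext_lanes(1, @b)]}\n";   # x5 x6 x7 x4     (ext #4)
    print "@{[ext_lanes(2, @c)]}\n";   # x10 x11 x8 x9   (ext #8)
    print "@{[ext_lanes(3, @d)]}\n";   # x15 x12 x13 x14 (ext #12)
    # Lanes now line up as the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12), ...
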
+
+$code.=<<___;
+
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp $len,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp @d[0], at d[1],[@x[0]] // load sigma
+ ld1 {@K[0]},[@x[0]],#16
+ ldp @d[2], at d[3],[$key] // load key
+ ldp @d[4], at d[5],[$key,#16]
+ ld1 {@K[1], at K[2]},[$key]
+ ldp @d[6], at d[7],[$ctr] // load counter
+ ld1 {@K[3]},[$ctr]
+ ld1 {$ONE},[@x[0]]
+#ifdef __ARMEB__
+ rev64 @K[0], at K[0]
+ ror @d[2], at d[2],#32
+ ror @d[3], at d[3],#32
+ ror @d[4], at d[4],#32
+ ror @d[5], at d[5],#32
+ ror @d[6], at d[6],#32
+ ror @d[7], at d[7],#32
+#endif
+ add @K[3], at K[3],$ONE // += 1
+ add @K[4], at K[3],$ONE
+ add @K[5], at K[4],$ONE
+ shl $ONE,$ONE,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov.32 @x[0], at d[0] // unpack key block
+ lsr @x[1], at d[0],#32
+ mov $A0, at K[0]
+ mov.32 @x[2], at d[1]
+ lsr @x[3], at d[1],#32
+ mov $A1, at K[0]
+ mov.32 @x[4], at d[2]
+ lsr @x[5], at d[2],#32
+ mov $A2, at K[0]
+ mov.32 @x[6], at d[3]
+ mov $B0, at K[1]
+ lsr @x[7], at d[3],#32
+ mov $B1, at K[1]
+ mov.32 @x[8], at d[4]
+ mov $B2, at K[1]
+ lsr @x[9], at d[4],#32
+ mov $D0, at K[3]
+ mov.32 @x[10], at d[5]
+ mov $D1, at K[4]
+ lsr @x[11], at d[5],#32
+ mov $D2, at K[5]
+ mov.32 @x[12], at d[6]
+ mov $C0, at K[2]
+ lsr @x[13], at d[6],#32
+ mov $C1, at K[2]
+ mov.32 @x[14], at d[7]
+ mov $C2, at K[2]
+ lsr @x[15], at d[7],#32
+
+ mov $ctr,#10
+ subs $len,$len,#256
+.Loop_neon:
+ sub $ctr,$ctr,#1
+___
+ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ my @thread3=&ROUND(0,4,8,12);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&ROUND(0,5,10,15);
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread3));
+ eval(shift(@thread1)); eval(shift(@thread3));
+ eval(shift(@thread2)); eval(shift(@thread3));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_neon
+
+ add.32 @x[0], at x[0], at d[0] // accumulate key block
+ add $A0,$A0, at K[0]
+ add @x[1], at x[1], at d[0],lsr#32
+ add $A1,$A1, at K[0]
+ add.32 @x[2], at x[2], at d[1]
+ add $A2,$A2, at K[0]
+ add @x[3], at x[3], at d[1],lsr#32
+ add $C0,$C0, at K[2]
+ add.32 @x[4], at x[4], at d[2]
+ add $C1,$C1, at K[2]
+ add @x[5], at x[5], at d[2],lsr#32
+ add $C2,$C2, at K[2]
+ add.32 @x[6], at x[6], at d[3]
+ add $D0,$D0, at K[3]
+ add @x[7], at x[7], at d[3],lsr#32
+ add.32 @x[8], at x[8], at d[4]
+ add $D1,$D1, at K[4]
+ add @x[9], at x[9], at d[4],lsr#32
+ add.32 @x[10], at x[10], at d[5]
+ add $D2,$D2, at K[5]
+ add @x[11], at x[11], at d[5],lsr#32
+ add.32 @x[12], at x[12], at d[6]
+ add $B0,$B0, at K[1]
+ add @x[13], at x[13], at d[6],lsr#32
+ add.32 @x[14], at x[14], at d[7]
+ add $B1,$B1, at K[1]
+ add @x[15], at x[15], at d[7],lsr#32
+ add $B2,$B2, at K[1]
+
+ b.lo .Ltail_neon
+
+ add @x[0], at x[0], at x[1],lsl#32 // pack
+ add @x[2], at x[2], at x[3],lsl#32
+ ldp @x[1], at x[3],[$inp,#0] // load input
+ add @x[4], at x[4], at x[5],lsl#32
+ add @x[6], at x[6], at x[7],lsl#32
+ ldp @x[5], at x[7],[$inp,#16]
+ add @x[8], at x[8], at x[9],lsl#32
+ add @x[10], at x[10], at x[11],lsl#32
+ ldp @x[9], at x[11],[$inp,#32]
+ add @x[12], at x[12], at x[13],lsl#32
+ add @x[14], at x[14], at x[15],lsl#32
+ ldp @x[13], at x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[2], at x[2]
+ rev @x[4], at x[4]
+ rev @x[6], at x[6]
+ rev @x[8], at x[8]
+ rev @x[10], at x[10]
+ rev @x[12], at x[12]
+ rev @x[14], at x[14]
+#endif
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor @x[0], at x[0], at x[1]
+ eor @x[2], at x[2], at x[3]
+ eor @x[4], at x[4], at x[5]
+ eor @x[6], at x[6], at x[7]
+ eor @x[8], at x[8], at x[9]
+ eor $A0,$A0,$T0
+ eor @x[10], at x[10], at x[11]
+ eor $B0,$B0,$T1
+ eor @x[12], at x[12], at x[13]
+ eor $C0,$C0,$T2
+ eor @x[14], at x[14], at x[15]
+ eor $D0,$D0,$T3
+ ld1.8 {$T0-$T3},[$inp],#64
+
+ stp @x[0], at x[2],[$out,#0] // store output
+ add @d[6], at d[6],#4 // increment counter
+ stp @x[4], at x[6],[$out,#16]
+ add @K[3], at K[3],$ONE // += 4
+ stp @x[8], at x[10],[$out,#32]
+ add @K[4], at K[4],$ONE
+ stp @x[12], at x[14],[$out,#48]
+ add @K[5], at K[5],$ONE
+ add $out,$out,#64
+
+ st1.8 {$A0-$D0},[$out],#64
+ ld1.8 {$A0-$D0},[$inp],#64
+
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+
+ eor $A2,$A2,$A0
+ eor $B2,$B2,$B0
+ eor $C2,$C2,$C0
+ eor $D2,$D2,$D0
+ st1.8 {$A2-$D2},[$out],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add $len,$len,#256
+ cmp $len,#64
+ b.lo .Less_than_64
+
+ add @x[0], at x[0], at x[1],lsl#32 // pack
+ add @x[2], at x[2], at x[3],lsl#32
+ ldp @x[1], at x[3],[$inp,#0] // load input
+ add @x[4], at x[4], at x[5],lsl#32
+ add @x[6], at x[6], at x[7],lsl#32
+ ldp @x[5], at x[7],[$inp,#16]
+ add @x[8], at x[8], at x[9],lsl#32
+ add @x[10], at x[10], at x[11],lsl#32
+ ldp @x[9], at x[11],[$inp,#32]
+ add @x[12], at x[12], at x[13],lsl#32
+ add @x[14], at x[14], at x[15],lsl#32
+ ldp @x[13], at x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[2], at x[2]
+ rev @x[4], at x[4]
+ rev @x[6], at x[6]
+ rev @x[8], at x[8]
+ rev @x[10], at x[10]
+ rev @x[12], at x[12]
+ rev @x[14], at x[14]
+#endif
+ eor @x[0], at x[0], at x[1]
+ eor @x[2], at x[2], at x[3]
+ eor @x[4], at x[4], at x[5]
+ eor @x[6], at x[6], at x[7]
+ eor @x[8], at x[8], at x[9]
+ eor @x[10], at x[10], at x[11]
+ eor @x[12], at x[12], at x[13]
+ eor @x[14], at x[14], at x[15]
+
+ stp @x[0], at x[2],[$out,#0] // store output
+ add @d[6], at d[6],#4 // increment counter
+ stp @x[4], at x[6],[$out,#16]
+ stp @x[8], at x[10],[$out,#32]
+ stp @x[12], at x[14],[$out,#48]
+ add $out,$out,#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+ cmp $len,#64
+ b.lo .Less_than_128
+
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor $A0,$A0,$T0
+ eor $B0,$B0,$T1
+ eor $C0,$C0,$T2
+ eor $D0,$D0,$T3
+ st1.8 {$A0-$D0},[$out],#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+ cmp $len,#64
+ b.lo .Less_than_192
+
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+ b.eq .Ldone_neon
+ sub $len,$len,#64
+
+ st1.8 {$A2-$D2},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1.8 {$A0-$D0},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1.8 {$A1-$D1},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub $out,$out,#1
+ add $inp,$inp,$len
+ add $out,$out,$len
+ add $ctr,sp,$len
+ neg $len,$len
+
+.Loop_tail_neon:
+ ldrb w10,[$inp,$len]
+ ldrb w11,[$ctr,$len]
+ add $len,$len,#1
+ eor w10,w10,w11
+ strb w10,[$out,$len]
+ cbnz $len,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_neon,.-ChaCha20_neon
+___
+{
+my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
+ $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
+
+$code.=<<___;
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr @x[0],.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp @d[0], at d[1],[@x[0]] // load sigma
+ ld1 {@K[0]},[@x[0]],#16
+ ldp @d[2], at d[3],[$key] // load key
+ ldp @d[4], at d[5],[$key,#16]
+ ld1 {@K[1], at K[2]},[$key]
+ ldp @d[6], at d[7],[$ctr] // load counter
+ ld1 {@K[3]},[$ctr]
+ ld1 {$ONE},[@x[0]]
+#ifdef __ARMEB__
+ rev64 @K[0], at K[0]
+ ror @d[2], at d[2],#32
+ ror @d[3], at d[3],#32
+ ror @d[4], at d[4],#32
+ ror @d[5], at d[5],#32
+ ror @d[6], at d[6],#32
+ ror @d[7], at d[7],#32
+#endif
+ add @K[3], at K[3],$ONE // += 1
+ stp @K[0], at K[1],[sp,#0] // off-load key block, invariant part
+ add @K[3], at K[3],$ONE // not typo
+ str @K[2],[sp,#32]
+ add @K[4], at K[3],$ONE
+ add @K[5], at K[4],$ONE
+ add @K[6], at K[5],$ONE
+ shl $ONE,$ONE,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub $len,$len,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov $A0, at K[0]
+ mov $A1, at K[0]
+ mov $A2, at K[0]
+ mov $A3, at K[0]
+ mov $A4, at K[0]
+ mov $A5, at K[0]
+ mov $B0, at K[1]
+ mov.32 @x[0], at d[0] // unpack key block
+ mov $B1, at K[1]
+ lsr @x[1], at d[0],#32
+ mov $B2, at K[1]
+ mov.32 @x[2], at d[1]
+ mov $B3, at K[1]
+ lsr @x[3], at d[1],#32
+ mov $B4, at K[1]
+ mov.32 @x[4], at d[2]
+ mov $B5, at K[1]
+ lsr @x[5], at d[2],#32
+ mov $D0, at K[3]
+ mov.32 @x[6], at d[3]
+ mov $D1, at K[4]
+ lsr @x[7], at d[3],#32
+ mov $D2, at K[5]
+ mov.32 @x[8], at d[4]
+ mov $D3, at K[6]
+ lsr @x[9], at d[4],#32
+ mov $C0, at K[2]
+ mov.32 @x[10], at d[5]
+ mov $C1, at K[2]
+ lsr @x[11], at d[5],#32
+ add $D4,$D0,$ONE // +4
+ mov.32 @x[12], at d[6]
+ add $D5,$D1,$ONE // +4
+ lsr @x[13], at d[6],#32
+ mov $C2, at K[2]
+ mov.32 @x[14], at d[7]
+ mov $C3, at K[2]
+ lsr @x[15], at d[7],#32
+ mov $C4, at K[2]
+ stp @K[3], at K[4],[sp,#48] // off-load key block, variable part
+ mov $C5, at K[2]
+ str @K[5],[sp,#80]
+
+ mov $ctr,#5
+ subs $len,$len,#512
+.Loop_upper_neon:
+ sub $ctr,$ctr,#1
+___
+ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+ my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+ my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+ my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+ my $diff = ($#thread0+1)*6 - $#thread67 - 1;
+ my $i = 0;
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_upper_neon
+
+ add.32 @x[0], at x[0], at d[0] // accumulate key block
+ add @x[1], at x[1], at d[0],lsr#32
+ add.32 @x[2], at x[2], at d[1]
+ add @x[3], at x[3], at d[1],lsr#32
+ add.32 @x[4], at x[4], at d[2]
+ add @x[5], at x[5], at d[2],lsr#32
+ add.32 @x[6], at x[6], at d[3]
+ add @x[7], at x[7], at d[3],lsr#32
+ add.32 @x[8], at x[8], at d[4]
+ add @x[9], at x[9], at d[4],lsr#32
+ add.32 @x[10], at x[10], at d[5]
+ add @x[11], at x[11], at d[5],lsr#32
+ add.32 @x[12], at x[12], at d[6]
+ add @x[13], at x[13], at d[6],lsr#32
+ add.32 @x[14], at x[14], at d[7]
+ add @x[15], at x[15], at d[7],lsr#32
+
+ add @x[0], at x[0], at x[1],lsl#32 // pack
+ add @x[2], at x[2], at x[3],lsl#32
+ ldp @x[1], at x[3],[$inp,#0] // load input
+ add @x[4], at x[4], at x[5],lsl#32
+ add @x[6], at x[6], at x[7],lsl#32
+ ldp @x[5], at x[7],[$inp,#16]
+ add @x[8], at x[8], at x[9],lsl#32
+ add @x[10], at x[10], at x[11],lsl#32
+ ldp @x[9], at x[11],[$inp,#32]
+ add @x[12], at x[12], at x[13],lsl#32
+ add @x[14], at x[14], at x[15],lsl#32
+ ldp @x[13], at x[15],[$inp,#48]
+ add $inp,$inp,#64
+#ifdef __ARMEB__
+ rev @x[0], at x[0]
+ rev @x[2], at x[2]
+ rev @x[4], at x[4]
+ rev @x[6], at x[6]
+ rev @x[8], at x[8]
+ rev @x[10], at x[10]
+ rev @x[12], at x[12]
+ rev @x[14], at x[14]
+#endif
+ eor @x[0], at x[0], at x[1]
+ eor @x[2], at x[2], at x[3]
+ eor @x[4], at x[4], at x[5]
+ eor @x[6], at x[6], at x[7]
+ eor @x[8], at x[8], at x[9]
+ eor @x[10], at x[10], at x[11]
+ eor @x[12], at x[12], at x[13]
+ eor @x[14], at x[14], at x[15]
+
+ stp @x[0], at x[2],[$out,#0] // store output
+ add @d[6], at d[6],#1 // increment counter
+ mov.32 @x[0], at d[0] // unpack key block
+ lsr @x[1], at d[0],#32
+ stp @x[4], at x[6],[$out,#16]
+ mov.32 @x[2], at d[1]
+ lsr @x[3], at d[1],#32
+ stp @x[8], at x[10],[$out,#32]
+ mov.32 @x[4], at d[2]
+ lsr @x[5], at d[2],#32
+ stp @x[12], at x[14],[$out,#48]
+ add $out,$out,#64
+ mov.32 @x[6], at d[3]
+ lsr @x[7], at d[3],#32
+ mov.32 @x[8], at d[4]
+ lsr @x[9], at d[4],#32
+ mov.32 @x[10], at d[5]
+ lsr @x[11], at d[5],#32
+ mov.32 @x[12], at d[6]
+ lsr @x[13], at d[6],#32
+ mov.32 @x[14], at d[7]
+ lsr @x[15], at d[7],#32
+
+ mov $ctr,#5
+.Loop_lower_neon:
+ sub $ctr,$ctr,#1
+___
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+
+ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
+ @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
+ @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
+ @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
+ @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
+ @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
+
+ foreach (@thread0) {
+ eval; eval(shift(@thread67));
+ eval(shift(@thread1)); eval(shift(@thread67));
+ eval(shift(@thread2)); eval(shift(@thread67));
+ eval(shift(@thread3)); eval(shift(@thread67));
+ eval(shift(@thread4)); eval(shift(@thread67));
+ eval(shift(@thread5)); eval(shift(@thread67));
+ }
+$code.=<<___;
+ cbnz $ctr,.Loop_lower_neon
+
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ ldp @K[0],@K[1],[sp,#0]
+ add @x[1],@x[1],@d[0],lsr#32
+ ldp @K[2],@K[3],[sp,#32]
+ add.32 @x[2],@x[2],@d[1]
+ ldp @K[4],@K[5],[sp,#64]
+ add @x[3],@x[3],@d[1],lsr#32
+ add $A0,$A0,@K[0]
+ add.32 @x[4],@x[4],@d[2]
+ add $A1,$A1,@K[0]
+ add @x[5],@x[5],@d[2],lsr#32
+ add $A2,$A2,@K[0]
+ add.32 @x[6],@x[6],@d[3]
+ add $A3,$A3,@K[0]
+ add @x[7],@x[7],@d[3],lsr#32
+ add $A4,$A4,@K[0]
+ add.32 @x[8],@x[8],@d[4]
+ add $A5,$A5,@K[0]
+ add @x[9],@x[9],@d[4],lsr#32
+ add $C0,$C0,@K[2]
+ add.32 @x[10],@x[10],@d[5]
+ add $C1,$C1,@K[2]
+ add @x[11],@x[11],@d[5],lsr#32
+ add $C2,$C2,@K[2]
+ add.32 @x[12],@x[12],@d[6]
+ add $C3,$C3,@K[2]
+ add @x[13],@x[13],@d[6],lsr#32
+ add $C4,$C4,@K[2]
+ add.32 @x[14],@x[14],@d[7]
+ add $C5,$C5,@K[2]
+ add @x[15],@x[15],@d[7],lsr#32
+ add $D4,$D4,$ONE // +4
+ add @x[0],@x[0],@x[1],lsl#32 // pack
+ add $D5,$D5,$ONE // +4
+ add @x[2],@x[2],@x[3],lsl#32
+ add $D0,$D0,@K[3]
+ ldp @x[1],@x[3],[$inp,#0] // load input
+ add $D1,$D1,@K[4]
+ add @x[4],@x[4],@x[5],lsl#32
+ add $D2,$D2,@K[5]
+ add @x[6],@x[6],@x[7],lsl#32
+ add $D3,$D3,@K[6]
+ ldp @x[5],@x[7],[$inp,#16]
+ add $D4,$D4,@K[3]
+ add @x[8],@x[8],@x[9],lsl#32
+ add $D5,$D5,@K[4]
+ add @x[10],@x[10],@x[11],lsl#32
+ add $B0,$B0,@K[1]
+ ldp @x[9],@x[11],[$inp,#32]
+ add $B1,$B1,@K[1]
+ add @x[12],@x[12],@x[13],lsl#32
+ add $B2,$B2,@K[1]
+ add @x[14],@x[14],@x[15],lsl#32
+ add $B3,$B3,@K[1]
+ ldp @x[13],@x[15],[$inp,#48]
+ add $B4,$B4,@K[1]
+ add $inp,$inp,#64
+ add $B5,$B5,@K[1]
+
+#ifdef __ARMEB__
+ rev @x[0],@x[0]
+ rev @x[2],@x[2]
+ rev @x[4],@x[4]
+ rev @x[6],@x[6]
+ rev @x[8],@x[8]
+ rev @x[10],@x[10]
+ rev @x[12],@x[12]
+ rev @x[14],@x[14]
+#endif
+ ld1.8 {$T0-$T3},[$inp],#64
+ eor @x[0],@x[0],@x[1]
+ eor @x[2],@x[2],@x[3]
+ eor @x[4],@x[4],@x[5]
+ eor @x[6],@x[6],@x[7]
+ eor @x[8],@x[8],@x[9]
+ eor $A0,$A0,$T0
+ eor @x[10],@x[10],@x[11]
+ eor $B0,$B0,$T1
+ eor @x[12],@x[12],@x[13]
+ eor $C0,$C0,$T2
+ eor @x[14],@x[14],@x[15]
+ eor $D0,$D0,$T3
+ ld1.8 {$T0-$T3},[$inp],#64
+
+ stp @x[0],@x[2],[$out,#0] // store output
+ add @d[6],@d[6],#7 // increment counter
+ stp @x[4],@x[6],[$out,#16]
+ stp @x[8],@x[10],[$out,#32]
+ stp @x[12],@x[14],[$out,#48]
+ add $out,$out,#64
+ st1.8 {$A0-$D0},[$out],#64
+
+ ld1.8 {$A0-$D0},[$inp],#64
+ eor $A1,$A1,$T0
+ eor $B1,$B1,$T1
+ eor $C1,$C1,$T2
+ eor $D1,$D1,$T3
+ st1.8 {$A1-$D1},[$out],#64
+
+ ld1.8 {$A1-$D1},[$inp],#64
+ eor $A2,$A2,$A0
+ ldp @K[0],@K[1],[sp,#0]
+ eor $B2,$B2,$B0
+ ldp @K[2],@K[3],[sp,#32]
+ eor $C2,$C2,$C0
+ eor $D2,$D2,$D0
+ st1.8 {$A2-$D2},[$out],#64
+
+ ld1.8 {$A2-$D2},[$inp],#64
+ eor $A3,$A3,$A1
+ eor $B3,$B3,$B1
+ eor $C3,$C3,$C1
+ eor $D3,$D3,$D1
+ st1.8 {$A3-$D3},[$out],#64
+
+ ld1.8 {$A3-$D3},[$inp],#64
+ eor $A4,$A4,$A2
+ eor $B4,$B4,$B2
+ eor $C4,$C4,$C2
+ eor $D4,$D4,$D2
+ st1.8 {$A4-$D4},[$out],#64
+
+ shl $A0,$ONE,#1 // 4 -> 8
+ eor $A5,$A5,$A3
+ eor $B5,$B5,$B3
+ eor $C5,$C5,$C3
+ eor $D5,$D5,$D3
+ st1.8 {$A5-$D5},[$out],#64
+
+ add @K[3],@K[3],$A0 // += 8
+ add @K[4],@K[4],$A0
+ add @K[5],@K[5],$A0
+ add @K[6],@K[6],$A0
+
+ b.hs .Loop_outer_512_neon
+
+ adds $len,$len,#512
+ ushr $A0,$ONE,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp @K[0],$ONE,[sp,#0] // wipe off-load area
+ stp @K[0],$ONE,[sp,#32]
+ stp @K[0],$ONE,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp $len,#192
+ sub @K[3],@K[3],$A0 // -= 1
+ sub @K[4],@K[4],$A0
+ sub @K[5],@K[5],$A0
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor @K[1],@K[1],@K[1]
+ eor @K[2],@K[2],@K[2]
+ eor @K[3],@K[3],@K[3]
+ eor @K[4],@K[4],@K[4]
+ eor @K[5],@K[5],@K[5]
+ eor @K[6],@K[6],@K[6]
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon
+___
+}
+}}}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
+ (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
+ (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
+ (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
+ (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
+
+ #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
+
+ print $_,"\n";
+}
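
The post-processing loop that closes chacha-armv8.pl above rewrites the size-annotated mnemonics used inside the here-docs (mov.32, ld1.8 and friends) into plain AArch64 syntax just before printing. Below is a minimal standalone sketch of that idea, not part of the commit: the sample lines are hypothetical, but the three substitutions are copied from the loop above.

#!/usr/bin/env perl
# Sketch only: push a few hypothetical, already-interpolated source
# lines through the first three rewrite rules of the loop above and
# print what the assembler finally sees.
use strict;

my @samples = (
    "\tmov.32\tx5,x10",             # 32-bit ALU op -> w registers
    "\teor\tv1.4s,v1.4s,v2.4s",     # bit-wise ops operate on .16b
    "\tld1.8\t{v0.4s},[x1],#16",    # byte-wise ld1/st1 drop the .8, data becomes .16b
);

foreach (@samples) {
    my $before = $_;
    (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
    (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
    (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1));
    print "$before  =>  $_\n";
}

The remaining two rules (ldr/ldp/str/stp operands folded to q registers, and rev32.16) are left out of the sketch for brevity.
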
diff --git a/crypto/poly1305/Makefile.in b/crypto/poly1305/Makefile.in
index c848843..0984a52 100644
--- a/crypto/poly1305/Makefile.in
+++ b/crypto/poly1305/Makefile.in
@@ -45,6 +45,9 @@ poly1305-x86_64.s: asm/poly1305-x86_64.pl
poly1305-%.S: asm/poly1305-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+poly1305-armv4.o: poly1305-armv4.S
+poly1305-armv8.o: poly1305-armv8.S
+
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
new file mode 100755
index 0000000..2cce9df
--- /dev/null
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -0,0 +1,1216 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.30/+130% 2.96
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.79/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
+# (**) these are trade-off results; they can be improved by ~8%, but at
+# the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
+# to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+#endif
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# ifdef __APPLE__
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7
+ tst r12,#1 @ check for NEON
+# ifdef __APPLE__
+ adr r9,poly1305_blocks_neon
+ adr r11,poly1305_blocks
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r11,r9
+ adr r12,poly1305_emit
+ adr r10,poly1305_emit_neon
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r12,r10
+# else
+# ifdef __thumb2__
+ itete eq
+# endif
+ addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
+# endif
+# ifdef __thumb2__
+ orr r12,r12,#1 @ thumb-ify address
+ orr r11,r11,#1
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ cmp $padbit,#0
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+ ldmia $ctx,{$h0-$r3} @ load context
+
+ str $ctx,[sp,#12] @ offload stuff
+ mov lr,$inp
+ str $len,[sp,#16]
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adc $h3,$h3,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmia $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$h4;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+
+ ldmia $ctx,{$h0-$h4}
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+ ands $len,$len,#-16
+ beq .Lno_data_neon
+
+ cmp $len,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq poly1305_blocks
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl poly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lbase2_32_neon:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+# ifdef __thumb2__
+ it lo
+# endif
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on the reduction in the previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+# ifdef __thumb2__
+ it lo
+# endif
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+# ifdef __thumb2__
+ it ne
+# endif
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+# ifdef __thumb2__
+ it ne
+# endif
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+# ifdef __thumb2__
+ it ne
+# endif
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vmov.u64 $MASK,#-1 @ can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vmov.u64 $MASK,#-1
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia $ctx,{$h0-$h4}
+ eor $g0,$g0,$g0
+
+ adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $h1,$h1,lsr#6
+ adcs $h1,$h1,$h2,lsl#20
+ mov $h2,$h2,lsr#12
+ adcs $h2,$h2,$h3,lsl#14
+ mov $h3,$h3,lsr#18
+ adcs $h3,$h3,$h4,lsl#8
+ adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
+
+ and $g0,$h4,#-4 @ ... so reduce
+ and $h4,$h3,#3
+ add $g0,$g0,$g0,lsr#2 @ *= 5
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adc $h3,$h3,#0
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+# ifdef __thumb2__
+ it ne
+# endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+# ifdef __thumb2__
+ it ne
+# endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+# ifdef __thumb2__
+ it ne
+# endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+# ifdef __thumb2__
+ it ne
+# endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0 @ accumulate nonce
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0] @ store the result
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lpoly1305_init
+#endif
+___
+} }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
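
A note on the NEON key setup above: poly1305_init_neon splits the four 32-bit key words into five 26-bit limbs with a short lsr/lsl/orr/and sequence. The following is a hedged reference sketch of that split in plain Perl, not part of the commit; the key words are arbitrary test values, and Math::BigInt is only used to confirm that the limbs recombine to the same 128-bit integer.

#!/usr/bin/env perl
# Sketch only: the base 2^32 -> base 2^26 split performed at the top of
# poly1305_init_neon, cross-checked with Math::BigInt.  Assumes a perl
# built with 64-bit integers.
use strict;
use Math::BigInt;

my @k = (0x12345678, 0x9abcdef0, 0x0fedcba9, 0x87654321);  # little-endian 32-bit words

my @l;
$l[0] =   $k[0]                         & 0x03ffffff;
$l[1] = (($k[0] >> 26) | ($k[1] <<  6)) & 0x03ffffff;
$l[2] = (($k[1] >> 20) | ($k[2] << 12)) & 0x03ffffff;
$l[3] = (($k[2] >> 14) | ($k[3] << 18)) & 0x03ffffff;
$l[4] =   $k[3] >>  8;

my $from_limbs = Math::BigInt->new(0);
$from_limbs += Math::BigInt->new($l[$_]) << (26 * $_) for 0 .. 4;
my $from_words = Math::BigInt->new(0);
$from_words += Math::BigInt->new($k[$_]) << (32 * $_) for 0 .. 3;

printf "limbs: %07x %07x %07x %07x %07x\n", @l;
print "round-trip ", ($from_limbs == $from_words ? "OK" : "MISMATCH"), "\n";

The 0x03ffffff masks are the scalar counterpart of the vbic.i32 #0xfc000000 clean-ups used throughout the NEON code.
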
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
new file mode 100755
index 0000000..79185d2
--- /dev/null
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -0,0 +1,925 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.63/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.39/+50% 1.18(*)
+# X-Gene 2.00/+68% 2.19
+#
+# (*) the estimate based on resource availability is less than 1.0,
+# i.e. the measured result is worse than expected; presumably the
+# binary translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+// forward "declarations" are required for Apple
+.extern OPENSSL_armcap_P
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifdef __ILP32__
+ ldrsw $t1,.LOPENSSL_armcap_P
+#else
+ ldr $t1,.LOPENSSL_armcap_P
+#endif
+ adr $t0,.LOPENSSL_armcap_P
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+ ldr w17,[$t0,$t1]
+#ifdef __ARMEB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ stp $r0,$r1,[$ctx,#32] // save key value
+
+ tst w17,#ARMV7_NEON
+
+ adr $d0,poly1305_blocks
+ adr $r0,poly1305_blocks_neon
+ adr $d1,poly1305_emit
+ adr $r1,poly1305_emit_neon
+
+ csel $d0,$d0,$r0,eq
+ csel $d1,$d1,$r1,eq
+
+ stp $d0,$d1,[$len]
+
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $r0,$r1,[$ctx,#32] // load key value
+ ldr $h2,[$ctx,#16]
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __ARMEB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adc $h1,$d1,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ str $h2,[$ctx,#16]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldr $h2,[$ctx,#16]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __ARMEB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adc $h1,$d1,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 5
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.hs .Lblocks_neon
+ cbz $is_base2_26,poly1305_blocks
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands $len,$len,#-16
+ b.eq .Lno_data_neon
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and $t0,$d2,#-4 // ... so reduce
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$h0,$t0
+ adc $h1,$h1,xzr
+
+#ifdef __ARMEB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz $padbit,.Lstore_base2_64_neon
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cbnz $len,.Leven_neon
+
+ stp w10,w11,[$ctx] // store hash value base 2^26
+ stp w12,w13,[$ctx,#8]
+ str w14,[$ctx,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp $h0,$h1,[$ctx] // store hash value base 2^64
+ stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ ldr x30,[sp,#8]
+
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ mov x4,#1
+ str x4,[$ctx,#-24] // set is_base2_26
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[$in2],#48
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on the reduction in the previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+
+ ushr $T0.2d,$ACC3,#26
+ fmov $IN01_4,x12
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ xtn $H0,$ACC0
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+ bic $H0,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+ bic $H4,#0xfc,lsl#24
+ bic $H1,#0xfc,lsl#24
+
+ add $H0,$H0,$T0.2s
+ shl $T0.2s,$T0.2s,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $H0,$H0,$T0.2s // h4 -> h0
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ ushr $T0.2s,$H0,#26
+ bic $H0,#0xfc,lsl#24
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ movi $MASK.2d,#-1
+ add $IN01_2,$IN01_2,$H2
+ ushr $MASK.2d,$MASK.2d,#38
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC2,$ACC2,$ACC2
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d12,d13,[sp,#48]
+ addp $ACC3,$ACC3,$ACC3
+ ldp d14,d15,[sp,#64]
+ addp $ACC4,$ACC4,$ACC4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ st1 {$ACC4}[0],[$ctx]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cbz $is_base2_26,poly1305_emit
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $h2,$h2,xzr // can be partially reduced...
+
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ and $d0,$h2,#-4 // ... so reduce
+ add $d0,$d0,$h2,lsr#2
+ and $h2,$h2,#3
+ adds $h0,$h0,$d0
+ adc $h1,$h1,xzr
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __ARMEB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+
+ print $_,"\n";
+}
+close STDOUT;
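
Finally, a hedged note on the arithmetic both Poly1305 modules rely on: the "final reduction" in poly1305_blocks/poly1305_mult folds the bits at and above 2^130 back into the low limbs multiplied by 5, which is sound because 2^130 = 5 (mod 2^130-5). The Perl sketch below, not part of the commit and using a made-up accumulator value, checks exactly that identity with Math::BigInt.

#!/usr/bin/env perl
# Sketch only: why "and t0,d2,#-4; add t0,t0,d2,lsr#2; adds h0,d0,t0"
# keeps h congruent modulo p = 2^130-5.
use strict;
use Math::BigInt;

my $one  = Math::BigInt->new(1);
my $p    = ($one << 130) - 5;                        # the Poly1305 prime
my $h    = (Math::BigInt->new(7) << 130) + 12345;    # hypothetical unreduced accumulator

my $c    = $h >> 130;                                # d2>>2 in the code above
my $low  = $h & (($one << 130) - 1);                 # h0, h1 and d2&3
my $part = $low + 5 * $c;                            # (d2&-4) + (d2>>2) = 5*c added back

print "partial reduction ",
      (($part % $p) == ($h % $p) ? "preserves" : "BREAKS"), " h mod p\n";

The s1 = r1 + (r1>>2) precomputation used by both modules rests on the same identity: s1 is 5*r1/4, which is exact because key clamping clears r1's two low bits.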