[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Wed Jun 21 13:22:53 UTC 2017
The branch master has been updated
via 1d23bbccd3dc966254368bcd102bb685c641b14f (commit)
from 9018f3ce0f9fd57e65d8e7d43741f08797811766 (commit)
- Log -----------------------------------------------------------------
commit 1d23bbccd3dc966254368bcd102bb685c641b14f
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Jun 17 20:29:52 2017 +0200
Add sha/asm/keccak1600-c64x.pl
[skip ci]
Reviewed-by: Bernd Edlinger <bernd.edlinger at hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/3708)
-----------------------------------------------------------------------
Summary of changes:
crypto/sha/asm/keccak1600-c64x.pl | 882 ++++++++++++++++++++++++++++++++++++++
1 file changed, 882 insertions(+)
create mode 100755 crypto/sha/asm/keccak1600-c64x.pl
diff --git a/crypto/sha/asm/keccak1600-c64x.pl b/crypto/sha/asm/keccak1600-c64x.pl
new file mode 100755
index 0000000..585f64b
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-c64x.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [ABI- and endian-neutral] Keccak-1600 for C64x.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
+# with bit interleaving. 64-bit values are simply split between the A-
+# and B-register files, with the A-file holding the least significant
+# halves. This works out perfectly, because all operations, including
+# the cross-communication in rotate operations, are always complementary.
+# Performance is [incredible for a 32-bit processor] 10.9 cycles per
+# processed byte for r=1088, which corresponds to SHA3-256. This is >15x
+# faster than compiler-generated KECCAK_1X_ALT code, and >10x faster than
+# other variants. On average the processor ends up issuing ~4.5
+# instructions per cycle...
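+
+# As a minimal reference sketch only (not used by the code generator,
+# and assuming a 64-bit perl), the interleaved representation and its
+# rotate rule look as follows: the even bits of a lane live in the
+# A-file, the odd bits in the B-file, so ROL64 by an even amount 2k is
+# a pair of 32-bit rotates by k, while an odd amount 2k+1 additionally
+# swaps the halves, rotating them by k+1 and k respectively.
+sub il_split {                        # lane -> (even bits, odd bits)
+    my ($v) = @_;
+    my ($lo, $hi) = (0, 0);
+    for my $i (0 .. 31) {
+        $lo |= (($v >> (2*$i))     & 1) << $i;  # even bits -> A-file
+        $hi |= (($v >> (2*$i + 1)) & 1) << $i;  # odd bits  -> B-file
+    }
+    ($lo, $hi);
+}
+sub il_rol64 {                        # ROL64 in the interleaved domain
+    my ($lo, $hi, $r) = @_;
+    my $rot32 = sub { my ($x, $n) = @_;
+                      $n ? ((($x << $n) | ($x >> (32 - $n))) & 0xffffffff)
+                         : $x };
+    $r & 1 ? ($rot32->($hi, ($r >> 1) + 1), $rot32->($lo, $r >> 1))
+           : ($rot32->($lo,  $r >> 1),      $rot32->($hi, $r >> 1));
+}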
+
+my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26));
+ $A[1][4] = 31; # B14 is reserved, A14 is used as iota[]
+ ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]);
+my @C = (0..4,$A[3][0],$A[4][0]);
+my $iotas = "A14";
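+# Throughout the generated code A$A[y][x] and B$A[y][x] interpolate to
+# register names (e.g. A5 and B5), holding the even- and odd-bit halves
+# of state word A[y][x]; likewise A$C[i]/B$C[i] for the Theta sums.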
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
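+# (the standard Keccak rho rotation offsets; the parity of each offset
+# selects which of the two ROL64 forms below gets emitted)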
+
+sub ROL64 {
+ my ($src,$rot,$dst,$p) = @_;
+
+ if ($rot&1) {
+$code.=<<___;
+$p ROTL B$src,$rot/2+1,A$dst
+|| ROTL A$src,$rot/2, B$dst
+___
+ } else {
+$code.=<<___;
+$p ROTL A$src,$rot/2,A$dst
+|| ROTL B$src,$rot/2,B$dst
+___
+ }
+}
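+# Note that only $src, $rot and $dst interpolate above; "/2+1" and "/2"
+# are emitted verbatim and folded into integer constants by the TI
+# assembler (e.g. a hypothetical "ROTL B5,55/2+1,A6" rotates by 28).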
+
+########################################################################
+# Stack frame layout
+#
+# SP--->+------+------+
+# | | |
+# +1--->+------+------+<- -9 the 4 slots below are used by KeccakF1600_int
+# | | |
+# +2--->+------+------+<- -8
+# | | |
+# +3--->+------+------+<- -7
+# | A2 | A3 | A3:A2 are preserved by KeccakF1600_int
+# +4--->+------+------+<- -6
+# | B2 | B3 | B3:B2 are preserved by KeccakF1600_int
+# +5--->+------+------+<- -5 the layout below is ABI-compliant
+# | A10 | A11 |
+# +6--->+------+------+<- -4
+# | A12 | A13 |
+# +7--->+------+------+<- -3
+# | A14 | B3 |
+# +8--->+------+------+<- -2
+# | B10 | B11 |
+# +9--->+------+------+<- -1
+# | B12 | B13 |
+# +------+------+<---FP
+# | A15 |
+# +------+--
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg KeccakF1600,_KeccakF1600
+ .asg SHA3_absorb,_SHA3_absorb
+ .asg SHA3_squeeze,_SHA3_squeeze
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .align 32
+_KeccakF1600_int:
+ .asmfunc
+ STDW A3:A2,*FP[-7]
+|| STDW B3:B2,*SP[4]
+_KeccakF1600_cheat:
+ .if __TI_EABI__
+ ADDKPC _KeccakF1600_int,B0
+|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
+ MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
+ .else
+ ADDKPC _KeccakF1600_int,B0
+|| MVKL (iotas-_KeccakF1600_int),$iotas
+ MVKH (iotas-_KeccakF1600_int),$iotas
+ .endif
+ ADD B0,$iotas,$iotas
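+;;===== each iteration below is one Keccak-f[1600] round: Theta first,
+;;===== then Rho+Pi, then Chi+Iota; there are 24 iterations, terminated
+;;===== once the iotas pointer wraps around (see the EXTU test below)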
+loop?:
+ XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
+|| XOR B$A[0][2],B$A[1][2],B$C[2]
+|| XOR A$A[0][3],A$A[1][3],A$C[3]
+|| XOR B$A[0][3],B$A[1][3],B$C[3]
+|| XOR A$A[0][0],A$A[1][0],A$C[0]
+|| XOR B$A[0][0],B$A[1][0],B$C[0]
+ XOR A$A[2][2],A$C[2],A$C[2]
+|| XOR B$A[2][2],B$C[2],B$C[2]
+|| XOR A$A[2][3],A$C[3],A$C[3]
+|| XOR B$A[2][3],B$C[3],B$C[3]
+|| XOR A$A[2][0],A$C[0],A$C[0]
+|| XOR B$A[2][0],B$C[0],B$C[0]
+ XOR A$A[3][2],A$C[2],A$C[2]
+|| XOR B$A[3][2],B$C[2],B$C[2]
+|| XOR A$A[3][3],A$C[3],A$C[3]
+|| XOR B$A[3][3],B$C[3],B$C[3]
+|| XOR A$A[3][0],A$C[0],A$C[0]
+|| XOR B$A[3][0],B$C[0],B$C[0]
+ XOR A$A[4][2],A$C[2],A$C[2]
+|| XOR B$A[4][2],B$C[2],B$C[2]
+|| XOR A$A[4][3],A$C[3],A$C[3]
+|| XOR B$A[4][3],B$C[3],B$C[3]
+|| XOR A$A[4][0],A$C[0],A$C[0]
+|| XOR B$A[4][0],B$C[0],B$C[0]
+ XOR A$A[0][4],A$A[1][4],A$C[4]
+|| XOR B$A[0][4],B$A[1][4],B$C[4]
+|| XOR A$A[0][1],A$A[1][1],A$C[1]
+|| XOR B$A[0][1],B$A[1][1],B$C[1]
+|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
+ STDW B$A[3][0]:B$A[4][0],*SP[2]
+|| XOR A$A[2][4],A$C[4],A$C[4]
+|| XOR B$A[2][4],B$C[4],B$C[4]
+|| XOR A$A[2][1],A$C[1],A$C[1]
+|| XOR B$A[2][1],B$C[1],B$C[1]
+|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
+|| ROTL A$C[2],0,B$C[5]
+ XOR A$A[3][4],A$C[4],A$C[4]
+|| XOR B$A[3][4],B$C[4],B$C[4]
+|| XOR A$A[3][1],A$C[1],A$C[1]
+|| XOR B$A[3][1],B$C[1],B$C[1]
+|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
+|| ROTL A$C[3],0,B$C[6]
+ XOR A$A[4][4],A$C[4],A$C[4]
+|| XOR B$A[4][4],B$C[4],B$C[4]
+|| XOR A$A[4][1],A$C[1],A$C[1]
+|| XOR B$A[4][1],B$C[1],B$C[1]
+|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
+|| XOR B$C[0],B$C[5],B$C[5]
+ XOR A$C[5],A$A[0][1],A$A[0][1]
+|| XOR B$C[5],B$A[0][1],B$A[0][1]
+|| XOR A$C[5],A$A[1][1],A$A[1][1]
+|| XOR B$C[5],B$A[1][1],B$A[1][1]
+|| XOR A$C[5],A$A[2][1],A$A[2][1]
+|| XOR B$C[5],B$A[2][1],B$A[2][1]
+ XOR A$C[5],A$A[3][1],A$A[3][1]
+|| XOR B$C[5],B$A[3][1],B$A[3][1]
+|| XOR A$C[5],A$A[4][1],A$A[4][1]
+|| XOR B$C[5],B$A[4][1],B$A[4][1]
+|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
+|| ROTL A$C[4],0,B$C[5]
+|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
+|| XOR B$C[1],B$C[6],B$C[6]
+ XOR A$C[6],A$A[0][2],A$A[0][2]
+|| XOR B$C[6],B$A[0][2],B$A[0][2]
+|| XOR A$C[6],A$A[1][2],A$A[1][2]
+|| XOR B$C[6],B$A[1][2],B$A[1][2]
+|| XOR A$C[6],A$A[2][2],A$A[2][2]
+|| XOR B$C[6],B$A[2][2],B$A[2][2]
+|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
+|| ROTL A$C[1],0,B$C[1]
+ XOR A$C[6],A$A[3][2],A$A[3][2]
+|| XOR B$C[6],B$A[3][2],B$A[3][2]
+|| XOR A$C[6],A$A[4][2],A$A[4][2]
+|| XOR B$C[6],B$A[4][2],B$A[4][2]
+|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
+|| ROTL A$C[0],0,B$C[6]
+|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
+|| XOR B$C[5],B$C[2],B$C[2]
+ XOR A$C[2],A$A[0][3],A$A[0][3]
+|| XOR B$C[2],B$A[0][3],B$A[0][3]
+|| XOR A$C[2],A$A[1][3],A$A[1][3]
+|| XOR B$C[2],B$A[1][3],B$A[1][3]
+|| XOR A$C[2],A$A[2][3],A$A[2][3]
+|| XOR B$C[2],B$A[2][3],B$A[2][3]
+ XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
+|| XOR B$C[6],B$C[3],B$C[3]
+|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
+|| LDDW *SP[2],B$A[3][0]:B$A[4][0]
+|| XOR A$C[2],A$A[3][3],A$A[3][3]
+|| XOR B$C[2],B$A[3][3],B$A[3][3]
+ XOR A$C[2],A$A[4][3],A$A[4][3]
+|| XOR B$C[2],B$A[4][3],B$A[4][3]
+|| XOR A$C[3],A$A[0][4],A$A[0][4]
+|| XOR B$C[3],B$A[0][4],B$A[0][4]
+|| XOR A$C[3],A$A[1][4],A$A[1][4]
+|| XOR B$C[3],B$A[1][4],B$A[1][4]
+ XOR A$C[3],A$A[2][4],A$A[2][4]
+|| XOR B$C[3],B$A[2][4],B$A[2][4]
+|| XOR A$C[3],A$A[3][4],A$A[3][4]
+|| XOR B$C[3],B$A[3][4],B$A[3][4]
+|| XOR A$C[3],A$A[4][4],A$A[4][4]
+|| XOR B$C[3],B$A[4][4],B$A[4][4]
+ XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
+|| XOR B$C[1],B$C[4],B$C[4]
+|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
+|| MV B$A[0][1],B$C[1]
+___
+ &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[0][0],A$A[0][0]
+|| XOR B$C[4],B$A[0][0],B$A[0][0]
+|| XOR A$C[4],A$A[1][0],A$A[1][0]
+|| XOR B$C[4],B$A[1][0],B$A[1][0]
+|| MV A$A[0][3],A$C[3]
+|| MV B$A[0][3],B$C[3]
+___
+ &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[2][0],A$A[2][0]
+|| XOR B$C[4],B$A[2][0],B$A[2][0]
+|| XOR A$C[4],A$A[3][0],A$A[3][0]
+|| XOR B$C[4],B$A[3][0],B$A[3][0]
+|| MV A$A[0][2],A$C[2]
+|| MV B$A[0][2],B$C[2]
+___
+ &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[4][0],A$A[4][0]
+|| XOR B$C[4],B$A[4][0],B$A[4][0]
+|| MV A$A[0][4],A$C[4]
+|| MV B$A[0][4],B$C[4]
+___
+ &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
+
+ &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
+$code.=<<___;
+|| LDW *${iotas}++[2],A$C[0]
+___
+ &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
+$code.=<<___;
+|| LDW *${iotas}[-1],B$C[0]
+___
+ &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
+ &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
+
+ &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
+ &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
+ &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
+ &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
+
+ &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
+ &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
+ &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
+ &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
+
+ &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
+ &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
+ &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
+ &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
+
+ #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
+ &ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
+ &ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
+ &ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
+$code.=<<___;
+|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
+|| ANDN B$A[0][2],B$A[0][1],B$C[4]
+|| ANDN A$A[0][3],A$A[0][2],A$C[1]
+|| ANDN B$A[0][3],B$A[0][2],B$C[1]
+|| ANDN A$A[0][4],A$A[0][3],A$C[2]
+|| ANDN B$A[0][4],B$A[0][3],B$C[2]
+___
+ &ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
+$code.=<<___;
+|| ANDN A$A[0][0],A$A[0][4],A$C[3]
+|| ANDN B$A[0][0],B$A[0][4],B$C[3]
+|| XOR A$C[4],A$A[0][0],A$A[0][0]
+|| XOR B$C[4],B$A[0][0],B$A[0][0]
+|| ANDN A$A[0][1],A$A[0][0],A$C[4]
+|| ANDN B$A[0][1],B$A[0][0],B$C[4]
+ XOR A$C[1],A$A[0][1],A$A[0][1]
+|| XOR B$C[1],B$A[0][1],B$A[0][1]
+|| XOR A$C[2],A$A[0][2],A$A[0][2]
+|| XOR B$C[2],B$A[0][2],B$A[0][2]
+|| XOR A$C[3],A$A[0][3],A$A[0][3]
+|| XOR B$C[3],B$A[0][3],B$A[0][3]
+ XOR A$C[4],A$A[0][4],A$A[0][4]
+|| XOR B$C[4],B$A[0][4],B$A[0][4]
+|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
+|| XOR B$C[0],B$A[0][0],B$A[0][0]
+|| EXTU $iotas,24,24,A0 ; A0 is A$C[0], are we done?
+
+ ANDN A$A[1][2],A$A[1][1],A$C[4]
+|| ANDN B$A[1][2],B$A[1][1],B$C[4]
+|| ANDN A$A[1][3],A$A[1][2],A$C[1]
+|| ANDN B$A[1][3],B$A[1][2],B$C[1]
+|| ANDN A$A[1][4],A$A[1][3],A$C[2]
+|| ANDN B$A[1][4],B$A[1][3],B$C[2]
+ ANDN A$A[1][0],A$A[1][4],A$C[3]
+|| ANDN B$A[1][0],B$A[1][4],B$C[3]
+|| XOR A$C[4],A$A[1][0],A$A[1][0]
+|| XOR B$C[4],B$A[1][0],B$A[1][0]
+|| ANDN A$A[1][1],A$A[1][0],A$C[4]
+|| ANDN B$A[1][1],B$A[1][0],B$C[4]
+ XOR A$C[1],A$A[1][1],A$A[1][1]
+|| XOR B$C[1],B$A[1][1],B$A[1][1]
+|| XOR A$C[2],A$A[1][2],A$A[1][2]
+|| XOR B$C[2],B$A[1][2],B$A[1][2]
+|| XOR A$C[3],A$A[1][3],A$A[1][3]
+|| XOR B$C[3],B$A[1][3],B$A[1][3]
+ XOR A$C[4],A$A[1][4],A$A[1][4]
+|| XOR B$C[4],B$A[1][4],B$A[1][4]
+
+|| ANDN A$A[2][2],A$A[2][1],A$C[4]
+|| ANDN B$A[2][2],B$A[2][1],B$C[4]
+|| ANDN A$A[2][3],A$A[2][2],A$C[1]
+|| ANDN B$A[2][3],B$A[2][2],B$C[1]
+ ANDN A$A[2][4],A$A[2][3],A$C[2]
+|| ANDN B$A[2][4],B$A[2][3],B$C[2]
+|| ANDN A$A[2][0],A$A[2][4],A$C[3]
+|| ANDN B$A[2][0],B$A[2][4],B$C[3]
+|| XOR A$C[4],A$A[2][0],A$A[2][0]
+|| XOR B$C[4],B$A[2][0],B$A[2][0]
+ ANDN A$A[2][1],A$A[2][0],A$C[4]
+|| ANDN B$A[2][1],B$A[2][0],B$C[4]
+|| XOR A$C[1],A$A[2][1],A$A[2][1]
+|| XOR B$C[1],B$A[2][1],B$A[2][1]
+|| XOR A$C[2],A$A[2][2],A$A[2][2]
+|| XOR B$C[2],B$A[2][2],B$A[2][2]
+ XOR A$C[3],A$A[2][3],A$A[2][3]
+|| XOR B$C[3],B$A[2][3],B$A[2][3]
+|| XOR A$C[4],A$A[2][4],A$A[2][4]
+|| XOR B$C[4],B$A[2][4],B$A[2][4]
+
+ ANDN A$A[3][2],A$A[3][1],A$C[4]
+|| ANDN B$A[3][2],B$A[3][1],B$C[4]
+|| ANDN A$A[3][3],A$A[3][2],A$C[1]
+|| ANDN B$A[3][3],B$A[3][2],B$C[1]
+|| ANDN A$A[3][4],A$A[3][3],A$C[2]
+|| ANDN B$A[3][4],B$A[3][3],B$C[2]
+ ANDN A$A[3][0],A$A[3][4],A$C[3]
+|| ANDN B$A[3][0],B$A[3][4],B$C[3]
+|| XOR A$C[4],A$A[3][0],A$A[3][0]
+|| XOR B$C[4],B$A[3][0],B$A[3][0]
+|| ANDN A$A[3][1],A$A[3][0],A$C[4]
+|| ANDN B$A[3][1],B$A[3][0],B$C[4]
+ XOR A$C[1],A$A[3][1],A$A[3][1]
+|| XOR B$C[1],B$A[3][1],B$A[3][1]
+|| XOR A$C[2],A$A[3][2],A$A[3][2]
+|| XOR B$C[2],B$A[3][2],B$A[3][2]
+|| XOR A$C[3],A$A[3][3],A$A[3][3]
+||[A0] BNOP loop?
+ XOR B$C[3],B$A[3][3],B$A[3][3]
+|| XOR A$C[4],A$A[3][4],A$A[3][4]
+|| XOR B$C[4],B$A[3][4],B$A[3][4]
+||[!A0] LDDW *FP[-7],A3:A2
+||[!A0] LDDW *SP[4], RA:B2
+
+ ANDN A$A[4][2],A$A[4][1],A$C[4]
+|| ANDN B$A[4][2],B$A[4][1],B$C[4]
+|| ANDN A$A[4][3],A$A[4][2],A$C[1]
+|| ANDN B$A[4][3],B$A[4][2],B$C[1]
+|| ANDN A$A[4][4],A$A[4][3],A$C[2]
+|| ANDN B$A[4][4],B$A[4][3],B$C[2]
+ ANDN A$A[4][0],A$A[4][4],A$C[3]
+|| ANDN B$A[4][0],B$A[4][4],B$C[3]
+|| XOR A$C[4],A$A[4][0],A$A[4][0]
+|| XOR B$C[4],B$A[4][0],B$A[4][0]
+|| ANDN A$A[4][1],A$A[4][0],A$C[4]
+|| ANDN B$A[4][1],B$A[4][0],B$C[4]
+ XOR A$C[1],A$A[4][1],A$A[4][1]
+|| XOR B$C[1],B$A[4][1],B$A[4][1]
+|| XOR A$C[2],A$A[4][2],A$A[4][2]
+|| XOR B$C[2],B$A[4][2],B$A[4][2]
+|| XOR A$C[3],A$A[4][3],A$A[4][3]
+|| XOR B$C[3],B$A[4][3],B$A[4][3]
+ XOR A$C[4],A$A[4][4],A$A[4][4]
+|| XOR B$C[4],B$A[4][4],B$A[4][4]
+;;===== branch to loop? is taken here
+
+ BNOP RA,5
+ .endasmfunc
+
+ .newblock
+ .global _KeccakF1600
+ .align 32
+_KeccakF1600:
+ .asmfunc stack_usage(80)
+ STW FP,*SP--(80) ; save frame pointer
+|| MV SP,FP
+ STDW B13:B12,*SP[9]
+|| STDW A13:A12,*FP[-4]
+ STDW B11:B10,*SP[8]
+|| STDW A11:A10,*FP[-5]
+ STW RA, *SP[15]
+|| STW A14,*FP[-6]
+|| MV A4,A2
+|| ADD 4,A4,B2
+
+ LDW *A2++[2],A$A[0][0] ; load A[5][5]
+|| LDW *B2++[2],B$A[0][0]
+ LDW *A2++[2],A$A[0][1]
+|| LDW *B2++[2],B$A[0][1]
+ LDW *A2++[2],A$A[0][2]
+|| LDW *B2++[2],B$A[0][2]
+ LDW *A2++[2],A$A[0][3]
+|| LDW *B2++[2],B$A[0][3]
+ LDW *A2++[2],A$A[0][4]
+|| LDW *B2++[2],B$A[0][4]
+
+ LDW *A2++[2],A$A[1][0]
+|| LDW *B2++[2],B$A[1][0]
+ LDW *A2++[2],A$A[1][1]
+|| LDW *B2++[2],B$A[1][1]
+ LDW *A2++[2],A$A[1][2]
+|| LDW *B2++[2],B$A[1][2]
+ LDW *A2++[2],A$A[1][3]
+|| LDW *B2++[2],B$A[1][3]
+ LDW *A2++[2],A$A[1][4]
+|| LDW *B2++[2],B$A[1][4]
+
+ LDW *A2++[2],A$A[2][0]
+|| LDW *B2++[2],B$A[2][0]
+ LDW *A2++[2],A$A[2][1]
+|| LDW *B2++[2],B$A[2][1]
+ LDW *A2++[2],A$A[2][2]
+|| LDW *B2++[2],B$A[2][2]
+ LDW *A2++[2],A$A[2][3]
+|| LDW *B2++[2],B$A[2][3]
+ LDW *A2++[2],A$A[2][4]
+|| LDW *B2++[2],B$A[2][4]
+
+ LDW *A2++[2],A$A[3][0]
+|| LDW *B2++[2],B$A[3][0]
+ LDW *A2++[2],A$A[3][1]
+|| LDW *B2++[2],B$A[3][1]
+ LDW *A2++[2],A$A[3][2]
+|| LDW *B2++[2],B$A[3][2]
+ LDW *A2++[2],A$A[3][3]
+|| LDW *B2++[2],B$A[3][3]
+ LDW *A2++[2],A$A[3][4]
+|| LDW *B2++[2],B$A[3][4]
+|| BNOP _KeccakF1600_int
+
+ ADDKPC ret?,RA
+|| LDW *A2++[2],A$A[4][0]
+|| LDW *B2++[2],B$A[4][0]
+ LDW *A2++[2],A$A[4][1]
+|| LDW *B2++[2],B$A[4][1]
+ LDW *A2++[2],A$A[4][2]
+|| LDW *B2++[2],B$A[4][2]
+ LDW *A2++[2],A$A[4][3]
+|| LDW *B2++[2],B$A[4][3]
+ LDW *A2,A$A[4][4]
+|| LDW *B2,B$A[4][4]
+|| ADDK -192,A2 ; rewind
+|| ADDK -192,B2
+
+ .align 16
+ret?:
+ STW A$A[0][0],*A2++[2] ; store A[5][5]
+|| STW B$A[0][0],*B2++[2]
+ STW A$A[0][1],*A2++[2]
+|| STW B$A[0][1],*B2++[2]
+ STW A$A[0][2],*A2++[2]
+|| STW B$A[0][2],*B2++[2]
+ STW A$A[0][3],*A2++[2]
+|| STW B$A[0][3],*B2++[2]
+ STW A$A[0][4],*A2++[2]
+|| STW B$A[0][4],*B2++[2]
+
+ STW A$A[1][0],*A2++[2]
+|| STW B$A[1][0],*B2++[2]
+ STW A$A[1][1],*A2++[2]
+|| STW B$A[1][1],*B2++[2]
+ STW A$A[1][2],*A2++[2]
+|| STW B$A[1][2],*B2++[2]
+ STW A$A[1][3],*A2++[2]
+|| STW B$A[1][3],*B2++[2]
+ STW A$A[1][4],*A2++[2]
+|| STW B$A[1][4],*B2++[2]
+
+ STW A$A[2][0],*A2++[2]
+|| STW B$A[2][0],*B2++[2]
+ STW A$A[2][1],*A2++[2]
+|| STW B$A[2][1],*B2++[2]
+ STW A$A[2][2],*A2++[2]
+|| STW B$A[2][2],*B2++[2]
+ STW A$A[2][3],*A2++[2]
+|| STW B$A[2][3],*B2++[2]
+ STW A$A[2][4],*A2++[2]
+|| STW B$A[2][4],*B2++[2]
+
+ STW A$A[3][0],*A2++[2]
+|| STW B$A[3][0],*B2++[2]
+ STW A$A[3][1],*A2++[2]
+|| STW B$A[3][1],*B2++[2]
+ STW A$A[3][2],*A2++[2]
+|| STW B$A[3][2],*B2++[2]
+ STW A$A[3][3],*A2++[2]
+|| STW B$A[3][3],*B2++[2]
+ STW A$A[3][4],*A2++[2]
+|| STW B$A[3][4],*B2++[2]
+
+ LDW *SP[15],RA
+|| LDW *FP[-6],A14
+
+ STW A$A[4][0],*A2++[2]
+|| STW B$A[4][0],*B2++[2]
+ STW A$A[4][1],*A2++[2]
+|| STW B$A[4][1],*B2++[2]
+ STW A$A[4][2],*A2++[2]
+|| STW B$A[4][2],*B2++[2]
+ STW A$A[4][3],*A2++[2]
+|| STW B$A[4][3],*B2++[2]
+ STW A$A[4][4],*A2
+|| STW B$A[4][4],*B2
+|| ADDK -192,A2 ; rewind
+
+ MV A2,A4 ; return original A4
+|| LDDW *SP[8], B11:B10
+|| LDDW *FP[-5],A11:A10
+ LDDW *SP[9], B13:B12
+|| LDDW *FP[-4],A13:A12
+|| BNOP RA
+ LDW *++SP(80),FP ; restore frame pointer
+ NOP 4 ; wait till FP is committed
+ .endasmfunc
+
+ .newblock
+ .asg B2,BSZ
+ .asg A2,INP
+ .asg A3,LEN
+ .global _SHA3_absorb
+ .align 32
+_SHA3_absorb:
+ .asmfunc stack_usage(80)
+ STW FP,*SP--(80) ; save frame pointer
+|| MV SP,FP
+ STDW B13:B12,*SP[9]
+|| STDW A13:A12,*FP[-4]
+ STDW B11:B10,*SP[8]
+|| STDW A11:A10,*FP[-5]
+ STW RA, *SP[15]
+|| STW A14,*FP[-6]
+
+ STW A4,*SP[1] ; save A[][]
+|| MV B4,INP ; reassign arguments
+|| MV A6,LEN
+|| MV B6,BSZ
+|| ADD 4,A4,B4
+
+ LDW *A4++[2],A$A[0][0] ; load A[5][5]
+|| LDW *B4++[2],B$A[0][0]
+ LDW *A4++[2],A$A[0][1]
+|| LDW *B4++[2],B$A[0][1]
+ LDW *A4++[2],A$A[0][2]
+|| LDW *B4++[2],B$A[0][2]
+ LDW *A4++[2],A$A[0][3]
+|| LDW *B4++[2],B$A[0][3]
+ LDW *A4++[2],A$A[0][4]
+|| LDW *B4++[2],B$A[0][4]
+
+ LDW *A4++[2],A$A[1][0]
+|| LDW *B4++[2],B$A[1][0]
+ LDW *A4++[2],A$A[1][1]
+|| LDW *B4++[2],B$A[1][1]
+ LDW *A4++[2],A$A[1][2]
+|| LDW *B4++[2],B$A[1][2]
+ LDW *A4++[2],A$A[1][3]
+|| LDW *B4++[2],B$A[1][3]
+ LDW *A4++[2],A$A[1][4]
+|| LDW *B4++[2],B$A[1][4]
+
+ LDW *A4++[2],A$A[2][0]
+|| LDW *B4++[2],B$A[2][0]
+ LDW *A4++[2],A$A[2][1]
+|| LDW *B4++[2],B$A[2][1]
+ LDW *A4++[2],A$A[2][2]
+|| LDW *B4++[2],B$A[2][2]
+ LDW *A4++[2],A$A[2][3]
+|| LDW *B4++[2],B$A[2][3]
+ LDW *A4++[2],A$A[2][4]
+|| LDW *B4++[2],B$A[2][4]
+
+ LDW *A4++[2],A$A[3][0]
+|| LDW *B4++[2],B$A[3][0]
+ LDW *A4++[2],A$A[3][1]
+|| LDW *B4++[2],B$A[3][1]
+ LDW *A4++[2],A$A[3][2]
+|| LDW *B4++[2],B$A[3][2]
+ LDW *A4++[2],A$A[3][3]
+|| LDW *B4++[2],B$A[3][3]
+ LDW *A4++[2],A$A[3][4]
+|| LDW *B4++[2],B$A[3][4]
+
+ LDW *A4++[2],A$A[4][0]
+|| LDW *B4++[2],B$A[4][0]
+ LDW *A4++[2],A$A[4][1]
+|| LDW *B4++[2],B$A[4][1]
+ LDW *A4++[2],A$A[4][2]
+|| LDW *B4++[2],B$A[4][2]
+ LDW *A4++[2],A$A[4][3]
+|| LDW *B4++[2],B$A[4][3]
+ LDW *A4,A$A[4][4]
+|| LDW *B4,B$A[4][4]
+|| ADDKPC loop?,RA
+ STDW RA:BSZ,*SP[4]
+
+loop?:
+ CMPLTU LEN,BSZ,A0 ; len < bsz?
+|| SHRU BSZ,3,BSZ
+ [A0] BNOP ret?
+||[A0] ZERO BSZ
+||[A0] LDW *SP[1],A2 ; pull A[][]
+ [BSZ] LDNDW *INP++,A1:A0
+||[BSZ] SUB LEN,8,LEN
+||[BSZ] SUB BSZ,1,BSZ
+ NOP 4
+___
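+# Each 8-byte input block is bit-interleaved on the fly: DEAL splits a
+# 32-bit word into its even bits (lower half-word) and odd bits (upper
+# half-word), and PACK2/PACKH2 then regroup the two words so that A0
+# carries the lane's even bits and A1 its odd bits, ready to be XORed
+# into the A- and B-file halves of the state.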
+for ($y = 0; $y < 5; $y++) {
+ for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) {
+$code.=<<___;
+ .if .BIG_ENDIAN
+ SWAP2 A0,A1
+|| SWAP2 A1,A0
+ SWAP4 A0,A0
+ SWAP4 A1,A1
+||[!BSZ]BNOP _KeccakF1600_cheat
+||[!BSZ]STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ .else
+ [!BSZ]BNOP _KeccakF1600_cheat
+||[!BSZ]STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ .endif
+ [BSZ] LDNDW *INP++,A1:A0
+|| DEAL A1,A1
+ [BSZ] SUB LEN,8,LEN
+||[BSZ] SUB BSZ,1,BSZ
+ PACK2 A1,A0,A0
+|| PACKH2 A1,A0,A1
+ XOR A0,A$A[$y][$x],A$A[$y][$x]
+ XOR A1,B$A[$y][$x],B$A[$y][$x]
+___
+ }
+}
+$code.=<<___;
+ .if .BIG_ENDIAN
+ SWAP2 A0,A1
+|| SWAP2 A1,A0
+ SWAP4 A0,A0
+ SWAP4 A1,A1
+ .endif
+ BNOP _KeccakF1600_cheat
+|| STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ DEAL A1,A1
+ NOP
+ PACK2 A1,A0,A0
+|| PACKH2 A1,A0,A1
+ XOR A0,A$A[4][4],A$A[4][4]
+ XOR A1,B$A[4][4],B$A[4][4]
+
+ .align 16
+ret?:
+ MV LEN,A4 ; return value
+|| ADD 4,A2,B2
+
+ STW A$A[0][0],*A2++[2] ; store A[5][5]
+|| STW B$A[0][0],*B2++[2]
+ STW A$A[0][1],*A2++[2]
+|| STW B$A[0][1],*B2++[2]
+ STW A$A[0][2],*A2++[2]
+|| STW B$A[0][2],*B2++[2]
+ STW A$A[0][3],*A2++[2]
+|| STW B$A[0][3],*B2++[2]
+ STW A$A[0][4],*A2++[2]
+|| STW B$A[0][4],*B2++[2]
+
+ STW A$A[1][0],*A2++[2]
+|| STW B$A[1][0],*B2++[2]
+ STW A$A[1][1],*A2++[2]
+|| STW B$A[1][1],*B2++[2]
+ STW A$A[1][2],*A2++[2]
+|| STW B$A[1][2],*B2++[2]
+ STW A$A[1][3],*A2++[2]
+|| STW B$A[1][3],*B2++[2]
+ STW A$A[1][4],*A2++[2]
+|| STW B$A[1][4],*B2++[2]
+
+ STW A$A[2][0],*A2++[2]
+|| STW B$A[2][0],*B2++[2]
+ STW A$A[2][1],*A2++[2]
+|| STW B$A[2][1],*B2++[2]
+ STW A$A[2][2],*A2++[2]
+|| STW B$A[2][2],*B2++[2]
+ STW A$A[2][3],*A2++[2]
+|| STW B$A[2][3],*B2++[2]
+ STW A$A[2][4],*A2++[2]
+|| STW B$A[2][4],*B2++[2]
+
+ LDW *SP[15],RA
+|| LDW *FP[-6],A14
+
+ STW A$A[3][0],*A2++[2]
+|| STW B$A[3][0],*B2++[2]
+ STW A$A[3][1],*A2++[2]
+|| STW B$A[3][1],*B2++[2]
+ STW A$A[3][2],*A2++[2]
+|| STW B$A[3][2],*B2++[2]
+ STW A$A[3][3],*A2++[2]
+|| STW B$A[3][3],*B2++[2]
+ STW A$A[3][4],*A2++[2]
+|| STW B$A[3][4],*B2++[2]
+
+ LDDW *SP[8], B11:B10
+|| LDDW *FP[-5],A11:A10
+ LDDW *SP[9], B13:B12
+|| LDDW *FP[-4],A13:A12
+ BNOP RA
+|| LDW *++SP(80),FP ; restore frame pointer
+
+ STW A$A[4][0],*A2++[2]
+|| STW B$A[4][0],*B2++[2]
+ STW A$A[4][1],*A2++[2]
+|| STW B$A[4][1],*B2++[2]
+ STW A$A[4][2],*A2++[2]
+|| STW B$A[4][2],*B2++[2]
+ STW A$A[4][3],*A2++[2]
+|| STW B$A[4][3],*B2++[2]
+ STW A$A[4][4],*A2++[2]
+|| STW B$A[4][4],*B2++[2]
+ .endasmfunc
+
+ .newblock
+ .global _SHA3_squeeze
+ .asg A12,OUT
+ .asg A13,LEN
+ .asg A14,BSZ
+ .align 32
+_SHA3_squeeze:
+ .asmfunc stack_usage(24)
+ STW FP,*SP--(24) ; save frame pointer
+|| MV SP,FP
+ STW RA, *SP[5]
+|| STW A14,*FP[-2]
+ STDW A13:A12,*FP[-2]
+|| MV B4,OUT ; reassign arguments
+ MV A6,LEN
+|| MV B6,BSZ
+
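+; squeezed lanes are de-interleaved on the way out: PACK2/PACKH2 pair
+; up the even-bit (A-file) and odd-bit (B-file) halves, and SHFL, the
+; inverse of DEAL, shuffles them back into byte order before the
+; unaligned stores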
+loop?:
+ LDW *SP[5],RA ; reload RA
+|| SHRU BSZ,3,A1
+|| MV A4,A8
+|| ADD 4,A4,B8
+block?:
+ CMPLTU LEN,8,A0 ; len < 8?
+ [A0] BNOP tail?
+ LDW *A8++[2],A9
+|| LDW *B8++[2],B9
+|| SUB LEN,8,LEN ; len -= 8
+ MV LEN,A0
+|| SUB A1,1,A1 ; bsz--
+|| NOP 4
+ .if .BIG_ENDIAN
+ SWAP4 A9,A9
+|| SWAP4 B9,B9
+ SWAP2 A9,A9
+|| SWAP2 B9,B9
+ .endif
+ [!A0] BNOP ret?
+||[!A0] ZERO A1
+ PACK2 B9,A9,B7
+||[A1] BNOP block?
+ PACKH2 B9,A9,B9
+|| SHFL B7,B7
+ SHFL B9,B9
+ STNW B7,*OUT++
+ STNW B9,*OUT++
+ NOP
+
+ BNOP _KeccakF1600,4
+ ADDKPC loop?,RA
+
+ .align 16
+tail?:
+ .if .BIG_ENDIAN
+ SWAP4 A9,A9
+|| SWAP4 B9,B9
+ SWAP2 A9,A9
+|| SWAP2 B9,B9
+ .endif
+ PACK2 B9,A9,B7
+ PACKH2 B9,A9,B9
+|| SHFL B7,B7
+ SHFL B9,B9
+
+ STB B7,*OUT++
+|| SHRU B7,8,B7
+|| ADD LEN,7,A0
+ [A0] STB B7,*OUT++
+||[A0] SHRU B7,8,B7
+||[A0] SUB A0,1,A0
+ [A0] STB B7,*OUT++
+||[A0] SHRU B7,8,B7
+||[A0] SUB A0,1,A0
+ [A0] STB B7,*OUT++
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+||[A0] SHRU B9,8,B9
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+||[A0] SHRU B9,8,B9
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+
+ret?:
+ LDDW *FP[-2],A13:A12
+ BNOP RA
+|| LDW *FP[-2],A14
+ LDW *++SP(24),FP ; restore frame pointer
+ NOP 4 ; wait till FP is committed
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 256
+ .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+iotas:
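+; bit-interleaved round constants: the first word of each pair holds
+; the even bits (A-file), the second the odd bits (B-file); the 64
+; bytes of zero padding above place the table so that the iotas
+; pointer's low byte wraps to zero after all 24 rounds, which is what
+; the EXTU-based termination test in KeccakF1600_int relies on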
+ .uword 0x00000001, 0x00000000
+ .uword 0x00000000, 0x00000089
+ .uword 0x00000000, 0x8000008b
+ .uword 0x00000000, 0x80008080
+ .uword 0x00000001, 0x0000008b
+ .uword 0x00000001, 0x00008000
+ .uword 0x00000001, 0x80008088
+ .uword 0x00000001, 0x80000082
+ .uword 0x00000000, 0x0000000b
+ .uword 0x00000000, 0x0000000a
+ .uword 0x00000001, 0x00008082
+ .uword 0x00000000, 0x00008003
+ .uword 0x00000001, 0x0000808b
+ .uword 0x00000001, 0x8000000b
+ .uword 0x00000001, 0x8000008a
+ .uword 0x00000001, 0x80000081
+ .uword 0x00000000, 0x80000081
+ .uword 0x00000000, 0x80000008
+ .uword 0x00000000, 0x00000083
+ .uword 0x00000000, 0x80008003
+ .uword 0x00000001, 0x80008088
+ .uword 0x00000000, 0x80000088
+ .uword 0x00000001, 0x00008000
+ .uword 0x00000000, 0x80008082
+
+ .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
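+
+# The generated assembly goes to standard output; a hypothetical
+# invocation would be "perl keccak1600-c64x.pl > keccak1600-c64x.s".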