[openssl-commits] [openssl] OpenSSL-fips-2_0-dev update
Dr. Stephen Henson
steve at openssl.org
Wed Aug 30 20:25:23 UTC 2017
The branch OpenSSL-fips-2_0-dev has been updated
via 781280094ad389e8958631b97e70f498becbd9cb (commit)
via 5526e5791f1426553b6f4806d1ac82efd6ab33bc (commit)
from fe36a698477e7cb1a49de3f4cba5ad7f89f5ad4c (commit)
- Log -----------------------------------------------------------------
commit 781280094ad389e8958631b97e70f498becbd9cb
Author: Andy Polyakov <appro at openssl.org>
Date: Fri Nov 25 11:52:06 2016 +0100
c6x/* "facelift":
- make scripts executable;
- "parameterize" platform selection in c6x/do_fips;
- add c6x/fips_algvs.mak;
- add c6x/run6x.js launcher for more recent CCS versions;
Reviewed-by: Rich Salz <rsalz at openssl.org>
Reviewed-by: Tim Hudson <tjh at openssl.org>
Reviewed-by: Stephen Henson <steve at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4265)
commit 5526e5791f1426553b6f4806d1ac82efd6ab33bc
Author: Andy Polyakov <appro at openssl.org>
Date: Fri Nov 25 13:11:09 2016 +0100
Add some C64x assembly modules [by minor adjustments of C64x+ modules].
AES, SHA256 and SHA512 modules can actually replace corresponding
C64x+ modules. This is because C64x+ instructions don't actually
provide "killer-argument" advantage in these modules. As for SHA1,
even though its performance is exactly the same, the C64x+ module is
more responsive to interrupts, i.e. it doesn't inhibit them for
periods as long as the C64x module does.
Reviewed-by: Rich Salz <rsalz at openssl.org>
Reviewed-by: Tim Hudson <tjh at openssl.org>
Reviewed-by: Stephen Henson <steve at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4265)
-----------------------------------------------------------------------
Summary of changes:
Configure | 3 +-
c6x/do_fips | 7 +-
c6x/fips_algvs.mak | 14 ++
c6x/fips_standalone_sha1 | 0
c6x/incore6x | 0
c6x/run6x | 0
c6x/run6x.js | 91 ++++++++
crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl} | 176 ++++++++++------
crypto/{c64xpluscpuid.pl => c64xcpuid.pl} | 170 +++++++++++----
crypto/sha/asm/sha1-c64x-large.pl | 230 +++++++++++++++++++++
crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl} | 85 ++++----
.../sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} | 49 +++--
.../sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} | 75 ++++---
test/fips_algvs.c | 2 +-
util/mk1mf.pl | 2 +-
15 files changed, 713 insertions(+), 191 deletions(-)
mode change 100644 => 100755 c6x/do_fips
create mode 100644 c6x/fips_algvs.mak
mode change 100644 => 100755 c6x/fips_standalone_sha1
mode change 100644 => 100755 c6x/incore6x
mode change 100644 => 100755 c6x/run6x
create mode 100755 c6x/run6x.js
copy crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl} (93%)
copy crypto/{c64xpluscpuid.pl => c64xcpuid.pl} (56%)
create mode 100644 crypto/sha/asm/sha1-c64x-large.pl
copy crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl} (85%)
copy crypto/sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} (89%)
copy crypto/sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} (89%)
diff --git a/Configure b/Configure
index 84a2bc2..679252e 100755
--- a/Configure
+++ b/Configure
@@ -636,13 +636,14 @@ my %table=(
"uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
"c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:",
+"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS:::c64xcpuid.o:::aes-c64x.o aes_cbc.o aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:",
);
my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A
debug-VC-WIN64I debug-VC-WIN64A
VC-NT VC-CE VC-WIN32 debug-VC-WIN32
- BC-32 c64xplus
+ BC-32 c64xplus c64x
netware-clib netware-clib-bsdsock
netware-libc netware-libc-bsdsock);
diff --git a/c6x/do_fips b/c6x/do_fips
old mode 100644
new mode 100755
index c1c29fc..4045e60
--- a/c6x/do_fips
+++ b/c6x/do_fips
@@ -1,6 +1,11 @@
#!/bin/sh
-perl Configure c64xplus fipscanisteronly no-engine
+if ! which cl6x > /dev/null 2>&1; then
+ echo 'fatal: cl6x is not on $PATH'
+ exit 1
+fi
+
+perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine
perl util/mkfiles.pl > MINFO
perl util/mk1mf.pl auto > c6x/fips.mak
make -f c6x/fips.mak
diff --git a/c6x/fips_algvs.mak b/c6x/fips_algvs.mak
new file mode 100644
index 0000000..7f67927
--- /dev/null
+++ b/c6x/fips_algvs.mak
@@ -0,0 +1,14 @@
+CC=cl6x
+CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. -Ic6x/inc -Ifips -DNO_SYS_TYPES_H
+OBJ_D=c6x/tmp
+OUT_D=c6x
+
+all: $(OUT_D)/fips_algvs.out
+
+$(OBJ_D)/fips_algvs.obj: test/fips_algvs.c
+ $(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $<
+
+$(OUT_D)/fips_algvs.out: $(OBJ_D)/fips_algvs.obj $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+ $(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj
+ $(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+ $(OUT_D)/incore6x $@ || rm $@
diff --git a/c6x/fips_standalone_sha1 b/c6x/fips_standalone_sha1
old mode 100644
new mode 100755
diff --git a/c6x/incore6x b/c6x/incore6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x b/c6x/run6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x.js b/c6x/run6x.js
new file mode 100755
index 0000000..6d94949
--- /dev/null
+++ b/c6x/run6x.js
@@ -0,0 +1,91 @@
+#!/usr/bin/env dss.sh
+//
+// Debug Server Scripting C6x launcher.
+//
+
+importPackage(Packages.com.ti.debug.engine.scripting);
+importPackage(Packages.com.ti.ccstudio.scripting.environment);
+importPackage(Packages.java.lang);
+
+if (arguments.length == 0) {
+ // Extract script name from eclipse
+ var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
+ var matches = regex.exec(environment["eclipse.commands"]);
+
+ System.err.println("Usage: " + matches[1] + " executable [args]");
+ System.err.println();
+ System.err.println("You're also required to set CCSTARGETCONFIG " +
+ "environment variable to appoint");
+ System.err.println("proper .ccxml file, customarily one of " +
+ "$HOME/ti/CCSTargetConfigurations/*.ccxml");
+ quit(1);
+}
+
+try {
+ var prog = arguments[0];
+ var script = ScriptingEnvironment.instance();
+
+ var debugServer = script.getServer("DebugServer.1");
+
+ // CCSTARGETCONFIG environment variable should point at proper .ccxml,
+ // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
+ debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));
+
+ var debugSession = debugServer.openSession("*", "*");
+
+ // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
+ // standard output from the program...
+ var dot = prog.lastIndexOf(".");
+ var gel_out = prog + ".gel";
+ if (dot > 0) {
+ gel_out = prog.substr(0,dot) + ".gel";
+ }
+ debugSession.expression.evaluate('GEL_EnableFileOutput("'
+ + gel_out + '", 0, 0)');
+
+ debugSession.target.connect();
+
+ // It should be noted that "current working directory" for program
+ // executed on the target system is one where |prog| resides, and
+ // not where script executed [as one would expect]...
+ debugSession.memory.loadProgram(prog, arguments);
+
+ // Pull exit()'s address and set breakpoint, then just execute till
+ // it's reached...
+ var exitAddr = debugSession.symbol.getAddress("exit");
+ debugSession.breakpoint.add(exitAddr);
+
+ while (1) {
+ debugSession.target.run();
+
+ var PC = debugSession.expression.evaluate("PC");
+ if (PC == exitAddr) {
+ break;
+ }
+ }
+
+ // Snatch value passed to exit(), so that it can be passed down to
+ // shell as exit code from this script...
+ var exitCode = debugSession.expression.evaluate("A4");
+
+ // Last run to termination...
+ debugSession.target.run();
+ // Clean up...
+ debugSession.terminate();
+ debugServer.stop();
+
+ // It should be noted that there is kind of a bug in C6x run-time.
+ // Return value from main() is not passed to last implicit exit()
+ // call [as it would on other systems], but instead constant 1 is
+ // passed, which conventionally indicates an error. So that if one
+ // wants to pass specific exit code, or even 0 indicating "success",
+ // one has to call exit() explicitly instead of relying on value
+ // returned by main()...
+ quit(exitCode);
+
+} catch (e) {
+ // We catch everything, because default handler terminates script with
+ // "success" exit code upon exception...
+ System.err.println(e.rhinoException);
+ quit(139);
+}
diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64x.pl
similarity index 93%
copy from crypto/aes/asm/aes-c64xplus.pl
copy to crypto/aes/asm/aes-c64x.pl
index 206d7dc..0817128 100644
--- a/crypto/aes/asm/aes-c64xplus.pl
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -7,9 +7,9 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# [Endian-neutral] AES for C64x+.
+# [Endian-neutral] AES for C64x.
#
-# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# Even though loops are scheduled for 13 cycles, and thus expected
# performance is ~8.5 cycles per byte processed with 128-bit key,
# measured performance turned to be ~10 cycles per byte. Discrepancy
# must be caused by limitations of L1D memory banking(*), see SPRU871
@@ -45,6 +45,18 @@ open STDOUT,">$output";
$code=<<___;
.text
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg AES_encrypt,_AES_encrypt
+ .asg AES_decrypt,_AES_decrypt
+ .asg AES_set_encrypt_key,_AES_set_encrypt_key
+ .asg AES_set_decrypt_key,_AES_set_decrypt_key
+ .asg AES_ctr32_encrypt,_AES_ctr32_encrypt
+ .endif
+
.asg B3,RA
.asg A4,INP
.asg B4,OUT
@@ -75,13 +87,23 @@ _AES_encrypt:
.asmfunc
MVK 1,B2
__encrypt:
+ .if __TI_EABI__
[B2] LDNDW *INP++,A9:A8 ; load input
-|| MVKL (AES_Te-_AES_encrypt),$TEA
-|| ADDKPC _AES_encrypt,B0
+|| MVKL \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADDKPC __encrypt,B0
[B2] LDNDW *INP++,B9:B8
-|| MVKH (AES_Te-_AES_encrypt),$TEA
+|| MVKH \$PCR_OFFSET(AES_Te,__encrypt),$TEA
|| ADD 0,KEY,$KPA
|| ADD 4,KEY,$KPB
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Te-__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Te-__encrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .endif
LDW *$KPA++[2],$Te0[0] ; zero round key
|| LDW *$KPB++[2],$Te0[1]
|| MVK 60,A0
@@ -107,15 +129,14 @@ __encrypt:
|| XOR $Te0[1],$s[1],$s[1]
|| LDW *$KPA++[2],$K[0] ; 1st round key
|| LDW *$KPB++[2],$K[1]
- SUB B0,2,B0
- SPLOOPD 13
-|| MVC B0,ILC
-|| LDW *$KPA++[2],$K[2]
+ LDW *$KPA++[2],$K[2]
|| LDW *$KPB++[2],$K[3]
-;;====================================================================
- EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[1],EXT1,24,$Te1[1]
|| EXTU $s[0],EXT3,24,$Te3[0]
+|| SUB B0,1,B0
+;;====================================================================
+enc_loop?:
LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
@@ -150,12 +171,14 @@ __encrypt:
|| ROTL $Te1[0],TBL1,$Te3[1] ; t3
|| EXTU $s[2],EXT0,24,$Te0[2]
|| EXTU $s[3],EXT0,24,$Te0[3]
+|| [B0] SUB B0,1,B0
LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
|| ROTL $Te2[2],TBL2,$Te2[2] ; t0
|| ROTL $Te2[3],TBL2,$Te2[3] ; t1
|| XOR $K[0],$Te3[0],$s[0]
|| XOR $K[1],$Te1[1],$s[1]
+|| [B0] BNOP enc_loop?
ROTL $Te3[3],TBL3,$Te1[2] ; t0
|| ROTL $Te1[2],TBL1,$Te3[3] ; t1
|| XOR $K[2],$Te1[0],$s[2]
@@ -176,14 +199,13 @@ __encrypt:
|| XOR $s[3],$Te2[1],$s[3]
|| XOR $s[0],$Te0[0],$s[0]
|| XOR $s[1],$Te0[1],$s[1]
- SPKERNEL
-|| XOR.L $s[2],$Te3[2],$s[2]
-|| XOR.L $s[3],$Te1[3],$s[3]
-;;====================================================================
- ADD.D ${TEA},A0,${TEA} ; point to Te4
-|| ADD.D ${TEB},A0,${TEB}
+ XOR $s[2],$Te3[2],$s[2]
+|| XOR $s[3],$Te1[3],$s[3]
|| EXTU $s[1],EXT1,24,$Te1[1]
|| EXTU $s[0],EXT3,24,$Te3[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Te4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
@@ -277,13 +299,23 @@ _AES_decrypt:
.asmfunc
MVK 1,B2
__decrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .else
[B2] LDNDW *INP++,A9:A8 ; load input
-|| MVKL (AES_Td-_AES_decrypt),$TEA
-|| ADDKPC _AES_decrypt,B0
+|| MVKL (AES_Td-__decrypt),$TEA
+|| ADDKPC __decrypt,B0
[B2] LDNDW *INP++,B9:B8
-|| MVKH (AES_Td-_AES_decrypt),$TEA
+|| MVKH (AES_Td-__decrypt),$TEA
|| ADD 0,KEY,$KPA
|| ADD 4,KEY,$KPB
+ .endif
LDW *$KPA++[2],$Td0[0] ; zero round key
|| LDW *$KPB++[2],$Td0[1]
|| MVK 60,A0
@@ -309,15 +341,14 @@ __decrypt:
|| XOR $Td0[1],$s[1],$s[1]
|| LDW *$KPA++[2],$K[0] ; 1st round key
|| LDW *$KPB++[2],$K[1]
- SUB B0,2,B0
- SPLOOPD 13
-|| MVC B0,ILC
-|| LDW *$KPA++[2],$K[2]
+ LDW *$KPA++[2],$K[2]
|| LDW *$KPB++[2],$K[3]
-;;====================================================================
- EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[1],EXT3,24,$Td3[1]
|| EXTU $s[0],EXT1,24,$Td1[0]
+|| SUB B0,1,B0
+;;====================================================================
+dec_loop?:
LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
@@ -352,12 +383,14 @@ __decrypt:
|| ROTL $Td3[0],TBL3,$Td1[1] ; t3
|| EXTU $s[2],EXT0,24,$Td0[2]
|| EXTU $s[3],EXT0,24,$Td0[3]
+|| [B0] SUB B0,1,B0
LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
|| ROTL $Td2[2],TBL2,$Td2[2] ; t0
|| ROTL $Td2[3],TBL2,$Td2[3] ; t1
|| XOR $K[0],$Td1[0],$s[0]
|| XOR $K[1],$Td3[1],$s[1]
+|| [B0] BNOP dec_loop?
ROTL $Td1[3],TBL1,$Td3[2] ; t0
|| ROTL $Td3[2],TBL3,$Td1[3] ; t1
|| XOR $K[2],$Td3[0],$s[2]
@@ -378,14 +411,13 @@ __decrypt:
|| XOR $s[3],$Td2[1],$s[3]
|| XOR $s[0],$Td0[0],$s[0]
|| XOR $s[1],$Td0[1],$s[1]
- SPKERNEL
-|| XOR.L $s[2],$Td1[2],$s[2]
-|| XOR.L $s[3],$Td3[3],$s[3]
-;;====================================================================
- ADD.D ${TEA},A0,${TEA} ; point to Td4
-|| ADD.D ${TEB},A0,${TEB}
+ XOR $s[2],$Td1[2],$s[2]
+|| XOR $s[3],$Td3[3],$s[3]
|| EXTU $s[1],EXT3,24,$Td3[1]
|| EXTU $s[0],EXT1,24,$Td1[0]
+||[!B0] ADD ${TEA},A0,${TEA} ; point to Td4
+||[!B0] ADD ${TEB},A0,${TEB}
+;;====================================================================
LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
@@ -515,17 +547,26 @@ __set_encrypt_key:
[B0] B key256?
|| [A1] LDNDW *INP++,B19:B18
+ .if __TI_EABI__
[A0] ADD 0,KEY,$KPA
|| [A0] ADD 4,KEY,$KPB
-|| [A0] MVKL (AES_Te4-_AES_set_encrypt_key),$TEA
-|| [A0] ADDKPC _AES_set_encrypt_key,B6
- [A0] MVKH (AES_Te4-_AES_set_encrypt_key),$TEA
+|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
[A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .else
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .endif
NOP
NOP
BNOP RA,5
-|| MVK -2,RET ; unknown bit lenght
+|| MVK -2,RET ; unknown bit length
|| MVK 0,B0 ; redundant
;;====================================================================
;;====================================================================
@@ -543,13 +584,12 @@ key128?:
.endif
MVK 256,A0
-|| MVK 9,B0
+|| MVK 8,B0
- SPLOOPD 14
-|| MVC B0,ILC
-|| MV $TEA,$TEB
+ MV $TEA,$TEB
|| ADD $TEA,A0,A30 ; rcon
;;====================================================================
+loop128?:
LDW *A30++[1],A31 ; rcon[i]
|| MV $Te4[2],$K[2]
|| EXTU $K[3],EXT1,24,$Te4[0]
@@ -576,10 +616,12 @@ key128?:
.if .BIG_ENDIAN
PACK2 $Te4[0],$Te4[1],$Te4[1]
PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
PACKL4 $Te4[1],$Te4[3],$Te4[3]
.else
PACK2 $Te4[1],$Te4[0],$Te4[1]
PACK2 $Te4[3],A0,$Te4[3]
+|| BDEC loop128?,B0
PACKL4 $Te4[3],$Te4[1],$Te4[3]
.endif
XOR $Te4[3],$K[0],$Te4[0] ; K[0]
@@ -587,7 +629,6 @@ key128?:
MV $Te4[0],$K[0]
|| XOR $K[1],$K[2],$Te4[2] ; K[2]
XOR $Te4[2],$K[3],$K[3] ; K[3]
- SPKERNEL
;;====================================================================
BNOP RA
MV $Te4[2],$K[2]
@@ -802,17 +843,15 @@ _AES_set_decrypt_key:
ret?: ; B0 holds rounds or zero
[!B0] BNOP B31 ; return if zero
[B0] SHL B0,4,A0 ; offset to last round key
- [B0] SHRU B0,1,B1
- [B0] SUB B1,1,B1
- [B0] MVK 0x0000001B,B3 ; AES polynomial
+ [B0] SHRU B0,1,B2
+ [B0] SUB B2,2,B2
+|| [B0] MVK 0x0000001B,B3 ; AES polynomial
[B0] MVKH 0x07000000,B3
-
- SPLOOPD 9 ; flip round keys
-|| MVC B1,ILC
-|| MV B30,$KPA
-|| ADD B30,A0,$KPB
-|| MVK 16,A0 ; sizeof(round key)
+|| [B0] MV B30,$KPA
+ [B0] ADD B30,A0,$KPB
+|| [B0] MVK 16,A0 ; sizeof(round key)
;;====================================================================
+flip_loop?:
LDW *${KPA}[0],A16
|| LDW *${KPB}[0],B16
LDW *${KPA}[1],A17
@@ -823,6 +862,7 @@ ret?: ; B0 holds rounds or zero
|| ADD $KPA,A0,$KPA
|| LDW *${KPB}[3],B19
|| SUB $KPB,A0,$KPB
+|| BDEC flip_loop?,B2
NOP
STW B16,*${KPA}[-4]
|| STW A16,*${KPB}[4]
@@ -832,7 +872,6 @@ ret?: ; B0 holds rounds or zero
|| STW A18,*${KPB}[6]
STW B19,*${KPA}[-1]
|| STW A19,*${KPB}[7]
- SPKERNEL
;;====================================================================
SUB B0,1,B0 ; skip last round
|| ADD B30,A0,$KPA ; skip first round
@@ -847,10 +886,9 @@ ret?: ; B0 holds rounds or zero
|| MVK 0x00000B0B,B24
MVKH 0x09090000,A24
|| MVKH 0x0B0B0000,B24
- MVC B0,ILC
-|| SUB B0,1,B0
+ SUB B0,1,B0
- GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+ GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
|| GMPY4 $K[1],A24,$Kx9[1]
|| MVK 0x00000D0D,A25
|| MVK 0x00000E0E,B25
@@ -859,14 +897,14 @@ ret?: ; B0 holds rounds or zero
|| MVKH 0x0D0D0000,A25
|| MVKH 0x0E0E0000,B25
- GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+ GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
|| GMPY4 $K[1],B24,$KxB[1]
GMPY4 $K[2],B24,$KxB[2]
|| GMPY4 $K[3],B24,$KxB[3]
- SPLOOP 11 ; InvMixColumns
;;====================================================================
- GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
+invmix_loop?:
+ GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
|| GMPY4 $K[1],A25,$KxD[1]
|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16
|| SWAP2 $Kx9[1],$Kx9[1]
@@ -883,7 +921,7 @@ ret?: ; B0 holds rounds or zero
|| [B0] LDW *${KPA}[6],$K[2]
|| [B0] LDW *${KPB}[7],$K[3]
- GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
+ GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
|| GMPY4 $s[1],B25,$KxE[1]
|| XOR $Kx9[0],$KxB[0],$KxB[0]
|| XOR $Kx9[1],$KxB[1],$KxB[1]
@@ -900,10 +938,11 @@ ret?: ; B0 holds rounds or zero
|| ROTL $KxB[3],TBL3,$KxB[3]
|| SWAP2 $KxD[2],$KxD[2]
|| SWAP2 $KxD[3],$KxD[3]
+|| [B0] B invmix_loop?
XOR $KxE[0],$KxD[0],$KxE[0]
|| XOR $KxE[1],$KxD[1],$KxE[1]
-|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
|| [B0] GMPY4 $K[1],A24,$Kx9[1]
|| ADDAW $KPA,4,$KPA
XOR $KxE[2],$KxD[2],$KxE[2]
@@ -914,7 +953,7 @@ ret?: ; B0 holds rounds or zero
XOR $KxB[0],$KxE[0],$KxE[0]
|| XOR $KxB[1],$KxE[1],$KxE[1]
-|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
|| [B0] GMPY4 $K[1],B24,$KxB[1]
XOR $KxB[2],$KxE[2],$KxE[2]
|| XOR $KxB[3],$KxE[3],$KxE[3]
@@ -925,7 +964,6 @@ ret?: ; B0 holds rounds or zero
STW $KxE[2],*${KPA}[-2]
|| STW $KxE[3],*${KPB}[-1]
|| [B0] SUB B0,1,B0
- SPKERNEL
;;====================================================================
BNOP B31,3
MVC B30,GFPGFR ; restore GFPGFR(*)
@@ -943,7 +981,8 @@ _AES_ctr32_encrypt:
.asmfunc
LDNDW *${ivp}[0],A31:A30 ; load counter value
|| MV $blocks,A2 ; reassign $blocks
-|| DMV RA,$key,B27:B26 ; reassign RA and $key
+|| MV RA,B27 ; reassign RA
+|| MV $key,B26 ; reassign $key
LDNDW *${ivp}[1],B31:B30
|| MVK 0,B2 ; don't let __encrypt load input
|| MVK 0,A1 ; and postpone writing output
@@ -965,13 +1004,15 @@ ctr32_loop?:
|| [A2] LDNDW *INP++,B29:B28
.if .BIG_ENDIAN
[A1] STNDW A9:A8,*OUT++ ; save output
-|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt
+|| [A2] MV A31,A9 ; pass counter value to __encrypt
+|| [A2] MV A30,A8 ; pass counter value to __encrypt
[A1] STNDW B9:B8,*OUT++
|| [A2] DMV B31,B30,B9:B8
|| [A2] ADD B30,1,B30 ; counter++
.else
[A1] STNDW A9:A8,*OUT++ ; save output
-|| [A2] DMV A31,A30,A9:A8
+|| [A2] MV A31,A9
+|| [A2] MV A30,A8
|| [A2] SWAP2 B31,B0
|| [A2] ADD B31,1,B31 ; counter++
[A1] STNDW B9:B8,*OUT++
@@ -989,7 +1030,11 @@ ___
}
# Tables are kept in endian-neutral manner
$code.=<<___;
+ .if __TI_EABI__
+ .sect ".text:aes_asm.const"
+ .else
.sect ".const:aes_asm"
+ .endif
.align 128
AES_Te:
.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84
@@ -1322,8 +1367,9 @@ AES_Td4:
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
- .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .cstring "AES for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
+close STDOUT;
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xcpuid.pl
similarity index 56%
copy from crypto/c64xpluscpuid.pl
copy to crypto/c64xcpuid.pl
index 067b693..88fd153 100644
--- a/crypto/c64xpluscpuid.pl
+++ b/crypto/c64xcpuid.pl
@@ -1,5 +1,10 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -7,17 +12,39 @@ open STDOUT,">$output";
$code.=<<___;
.text
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg OPENSSL_rdtsc,_OPENSSL_rdtsc
+ .asg OPENSSL_cleanse,_OPENSSL_cleanse
+ .asg CRYPTO_memcmp,_CRYPTO_memcmp
+ .asg OPENSSL_atomic_add,_OPENSSL_atomic_add
+ .asg OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+ .asg OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+ .asg OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+ .endif
+
.asg B3,RA
+ .asg 0x01AC0000,TIMER_BASE ; Timer 2
.global _OPENSSL_rdtsc
_OPENSSL_rdtsc:
.asmfunc
- B RA
- MVC TSCL,B0
- MVC TSCH,B1
- [!B0] MVC B0,TSCL ; start TSC
- MV B0,A4
- MV B1,A5
+ MVKL TIMER_BASE,A5
+ MVKH TIMER_BASE,A5
+ LDW *A5[0],A2 ; load CTL
+ LDW *A5[2],A4 ; load CTN
+ NOP 2
+ .if .BIG_ENDIAN
+ MVK 0x2c0,A7 ; internal clock source, don't hold, go
+|| MVK -1,A6 ; maximum period
+ .else
+ MVK 0x2c0,A6 ; internal clock source, don't hold, go
+|| MVK -1,A7 ; maximum period
+ .endif
+ [!A2] STDW A7:A6,*A5[0] ; fire it up
+|| BNOP RA,5
.endasmfunc
.global _OPENSSL_cleanse
@@ -28,28 +55,34 @@ _OPENSSL_cleanse:
|| SHRU B4,3,B0 ; is length >= 8
|| ADD 1,A4,B6
[!B0] BNOP RA
+|| [B0] SUB B0,1,B2
|| ZERO A1
|| ZERO B1
- [B0] MVC B0,ILC
+ [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 0,B4,A1
||[!B0] CMPLT 1,B4,B1
+|| ZERO B5
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 2,B4,A1
||[!B0] CMPLT 3,B4,B1
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 4,B4,A1
||[!B0] CMPLT 5,B4,B1
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
+|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 6,B4,A1
[A1] STB A2,*A4++[2]
+|| [B2] BDEC cleanse_loop?,B2
- SPLOOP 1
+cleanse_loop?:
STNDW A3:A2,*A4++
|| SUB B4,8,B4
- SPKERNEL
+|| [B2] BDEC cleanse_loop?,B2
MV B4,B0 ; remaining bytes
|| ADD 1,A4,B6
@@ -57,33 +90,73 @@ _OPENSSL_cleanse:
[B0] CMPLT 0,B0,A1
|| [B0] CMPLT 1,B0,B1
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 2,B0,A1
|| [B0] CMPLT 3,B0,B1
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 4,B0,A1
|| [B0] CMPLT 5,B0,B1
[A1] STB A2,*A4++[2]
-|| [B1] STB B2,*B6++[2]
+|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 6,B0,A1
[A1] STB A2,*A4++[2]
.endasmfunc
+ .if 0
+ .global _CRYPTO_memcmp
+_CRYPTO_memcmp:
+ .asmfunc
+ MV A6,B0
+ [!B0] BNOP RA
+||[!B0] ZERO A4
+|| [B0] ZERO A1:A0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+ XOR A5,B5,A1
+|| [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+
+memcmp_loop?:
+ OR A1,A0,A0
+|| XOR A5,B5,A1
+|| [B0] LDBU *A4++,A5
+|| [B0] LDBU *B4++,B5
+|| [B0] BDEC memcmp_loop?,B0
+
+ BNOP RA,3
+ ZERO A4
+ [A0] MVK 1,A4
+ .endasmfunc
+ .endif
+
.global _OPENSSL_atomic_add
_OPENSSL_atomic_add:
.asmfunc
- MV A4,B0
-atomic_add?:
- LL *B0,B5
- NOP 4
+ BNOP atomic_store? ; pre-C64x+ systems are uni-processor, it's
+|| LDW *A4,B5 ; enough to hold interrupts off through
+ ; the load-update-store cycle to achieve
+ ; atomicity
+ NOP
+ BNOP RA,3 ; and this branch stretches even over store
ADD B4,B5,B5
- SL B5,*B0
- CMTL *B0,B1
- NOP 4
- [!B1] B atomic_add?
- [B1] BNOP RA,4
- MV B5,A4
+atomic_store?:
+ STW B5,*A4
+|| MV B5,A4
.endasmfunc
.global _OPENSSL_wipe_cpu
@@ -150,35 +223,34 @@ _OPENSSL_instrument_bus:
MV B4,B0 ; reassign sizeof(output)
|| MV A4,B4 ; reassign output
|| MVK 0x00004030,A3
+|| MVKL TIMER_BASE,B16
MV B0,A4 ; return value
|| MVK 1,A1
|| MVKH 0x01840000,A3 ; L1DWIBAR
- MVC TSCL,B8 ; collect 1st tick
+|| MVKH TIMER_BASE,B16
+ LDW *B16[2],B8 ; collect 1st tick
|| MVK 0x00004010,A5
+ NOP 4
MV B8,B9 ; lasttick = tick
|| MVK 0,B7 ; lastdiff = 0
|| MVKH 0x01840000,A5 ; L2WIBAR
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
- LL *B4,B5
+ LDW *B4,B5
NOP 4
ADD B7,B5,B5
- SL B5,*B4
- CMTL *B4,B1
- NOP 4
STW B5,*B4
bus_loop1?:
- MVC TSCL,B8
+ LDW *B16[2],B8
|| [B0] SUB B0,1,B0
+ NOP 4
SUB B8,B9,B7 ; lastdiff = tick - lasttick
|| MV B8,B9 ; lasttick = tick
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
- LL *B4,B5
+ LDW *B4,B5
NOP 4
ADD B7,B5,B5
- SL B5,*B4
- CMTL *B4,B1
STW B5,*B4 ; [!B1] is removed to flatten samples
|| ADDK 4,B4
|| [B0] BNOP bus_loop1?,5
@@ -192,42 +264,42 @@ _OPENSSL_instrument_bus2:
MV A6,B0 ; reassign max
|| MV B4,A6 ; reassing sizeof(output)
|| MVK 0x00004030,A3
+|| MVKL TIMER_BASE,B16
MV A4,B4 ; reassign output
|| MVK 0,A4 ; return value
|| MVK 1,A1
|| MVKH 0x01840000,A3 ; L1DWIBAR
+|| MVKH TIMER_BASE,B16
- MVC TSCL,B8 ; collect 1st tick
+ LDW *B16[2],B8 ; collect 1st tick
|| MVK 0x00004010,A5
+ NOP 4
MV B8,B9 ; lasttick = tick
|| MVK 0,B7 ; lastdiff = 0
|| MVKH 0x01840000,A5 ; L2WIBAR
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
- LL *B4,B5
+ LDW *B4,B5
NOP 4
ADD B7,B5,B5
- SL B5,*B4
- CMTL *B4,B1
- NOP 4
STW B5,*B4
- MVC TSCL,B8 ; collect 1st diff
+ LDW *B16[2],B8 ; collect 1st diff
+ NOP 4
SUB B8,B9,B7 ; lastdiff = tick - lasttick
|| MV B8,B9 ; lasttick = tick
|| SUB B0,1,B0
bus_loop2?:
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
- LL *B4,B5
+ LDW *B4,B5
NOP 4
ADD B7,B5,B5
- SL B5,*B4
- CMTL *B4,B1
STW B5,*B4 ; [!B1] is removed to flatten samples
||[!B0] BNOP bus_loop2_done?,2
|| SUB B0,1,B0
- MVC TSCL,B8
+ LDW *B16[2],B8
+ NOP 4
SUB B8,B9,B8
|| MV B8,B9
CMPEQ B8,B7,B2
@@ -240,6 +312,14 @@ bus_loop2?:
bus_loop2_done?:
BNOP RA,5
.endasmfunc
+
+ .if __TI_EABI__
+ .sect ".init_array"
+ .else
+ .sect ".pinit"
+ .endif
+ .align 4
+ .long _OPENSSL_rdtsc ; auto-start timer
___
print $code;
diff --git a/crypto/sha/asm/sha1-c64x-large.pl b/crypto/sha/asm/sha1-c64x-large.pl
new file mode 100644
index 0000000..3916ff3
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is fully-unrolled SHA1 implementation. It's 25% faster than
+# one with compact loops, doesn't use in-memory ring buffer, as
+# everything is accomodated in registers, and has "perfect" interrupt
+# agility. Drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
+ at V = ($A,$B,$C,$D,$E);
+ at X = map("B$_",(16..31));
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___ if ($i<14);
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| LDNW *${INP}++, at X[$i+2]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| SWAP2 @X[$i+1], at X[$i+1]
+|| ADD @X[$i],$e,$e ; E+=X[i]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| SWAP4 @X[$i+1], at X[$i+1]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i==14);
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i],$e,$e ; E+=X[i]
+|| SWAP2 @X[$i+1], at X[$i+1]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| SWAP4 @X[$i+1], at X[$i+1]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i==15);
+|| XOR @X[($j+2)&15], at X[$j], at X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15], at X[$j], at X[$j]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15], at X[$j], at X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1, at X[$j]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+$code.=<<___ if ($i>15);
+|| XOR @X[($j+2)&15], at X[$j], at X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| ANDN $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15], at X[$j], at X[$j]
+ OR $F0,$F,$F ; F_00_19(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15], at X[$j], at X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1, at X[$j]
+ ADD $F,$e,$e ; E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___ if ($i<79);
+|| XOR @X[($j+2)&15], at X[$j], at X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| XOR $c,$b,$F
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15], at X[$j], at X[$j]
+ XOR $d,$F,$F ; F_20_39(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+|| XOR @X[($j+13)&15], at X[$j], at X[$j]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1, at X[$j]
+ ADD $F,$e,$e ; E+=F_20_39(B,C,D)
+___
+$code.=<<___ if ($i==79);
+|| [A0] B loop?
+|| [A0] LDNW *${INP}++, at X[0] ; pre-fetch input
+ ROTL $a,5,$Arot ;; $i
+|| XOR $c,$b,$F
+|| ADD $K,$e,$e ; E+=K
+|| [A0] LDNW *${INP}++, at X[1]
+ XOR $d,$F,$F ; F_20_39(B,C,D)
+|| ROTL $b,30,$b
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+ ADD $Arot,$e,$e ; E+=rot(A,5)
+ ADD $F,$e,$e ; E+=F_20_39(B,C,D)
+|| ADD $Bctx,$a,$a ; accumulate context
+|| ADD $Cctx,$b,$b
+ ADD $Dctx,$c,$c
+|| ADD $Ectx,$d,$d
+|| ADD $Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___;
+|| XOR @X[($j+2)&15], at X[$j], at X[$j]
+ ROTL $a,5,$Arot ;; $i
+|| AND $c,$b,$F
+|| AND $d,$b,$F0
+|| ADD $K,$e,$e ; E+=K
+|| XOR @X[($j+8)&15], at X[$j], at X[$j]
+ XOR $F0,$F,$F
+|| AND $c,$d,$F0
+|| ROTL $b,30,$b
+|| XOR @X[($j+13)&15], at X[$j], at X[$j]
+|| ADD @X[$i&15],$e,$e ; E+=X[i]
+ XOR $F0,$F,$F ; F_40_59(B,C,D)
+|| ADD $Arot,$e,$e ; E+=rot(A,5)
+|| ROTL @X[$j],1, at X[$j]
+ ADD $F,$e,$e ; E+=F_20_39(B,C,D)
+___
+}
+
+$code=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg sha1_block_data_order,_sha1_block_data_order
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .BIG_ENDIAN
+ .asg MV,SWAP2
+ .asg MV,SWAP4
+ .endif
+
+ .global _sha1_block_data_order
+_sha1_block_data_order:
+ .asmfunc
+ MV $NUM,A0 ; reassign $NUM
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] LDW *${CTX}[0],$A ; load A-E...
+ [A0] LDW *${CTX}[1],$B
+ [A0] LDW *${CTX}[2],$C
+ [A0] LDW *${CTX}[3],$D
+ [A0] LDW *${CTX}[4],$E
+ [A0] LDNW *${INP}++, at X[0] ; pre-fetch input
+ [A0] LDNW *${INP}++, at X[1]
+ NOP 3
+
+loop?:
+ SUB A0,1,A0
+|| MV $A,$Actx
+|| MVD $B,$Bctx
+|| SWAP2 @X[0], at X[0]
+|| MVKL 0x5a827999,$K
+ MVKH 0x5a827999,$K ; K_00_19
+|| MV $C,$Cctx
+|| MV $D,$Dctx
+|| MVD $E,$Ectx
+|| SWAP4 @X[0], at X[0]
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+|| MVKL 0x6ed9eba1,$K
+ MVKH 0x6ed9eba1,$K ; K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+|| MVKL 0x8f1bbcdc,$K
+ MVKH 0x8f1bbcdc,$K ; K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+|| MVKL 0xca62c1d6,$K
+ MVKH 0xca62c1d6,$K ; K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ BNOP RA ; return
+ STW $A,*${CTX}[0] ; emit A-E...
+ STW $B,*${CTX}[1]
+ STW $C,*${CTX}[2]
+ STW $D,*${CTX}[3]
+ STW $E,*${CTX}[4]
+ .endasmfunc
+
+ .sect .const
+ .cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64x.pl
similarity index 85%
copy from crypto/sha/asm/sha1-c64xplus.pl
copy to crypto/sha/asm/sha1-c64x.pl
index 87000d1..d7a9dd1 100644
--- a/crypto/sha/asm/sha1-c64xplus.pl
+++ b/crypto/sha/asm/sha1-c64x.pl
@@ -7,19 +7,19 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# SHA1 for C64x+.
+# SHA1 for C64x.
#
-# November 2011
+# November 2016
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
-# Fully unrolled assembler would be ~5x larger and is likely to be
-# ~15% faster. It would be free from references to intermediate ring
-# buffer, but put more pressure on L1P [both because the code would be
-# larger and won't be using SPLOOP buffer]. There are no plans to
-# realize fully unrolled variant though...
+# Unlike its predecessor, the sha1-c64xplus module, this module has
+# worse interrupt agility. While the original added at most 5 cycles
+# of delay to interrupt response, this module adds up to 100. A fully
+# unrolled implementation wouldn't add any delay and would even be 25%
+# faster, but is almost 5x larger...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
@@ -39,6 +39,13 @@ open STDOUT,">$output";
$code=<<___;
.text
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg sha1_block_data_order,_sha1_block_data_order
+ .endif
+
.asg B3,RA
.asg A15,FP
.asg B15,SP
@@ -70,21 +77,21 @@ _sha1_block_data_order:
NOP 1
loop?:
- MVK 0x00007999,$K
-|| ADDAW SP,2,$XPA
-|| SUB A0,1,A0
-|| MVK 13,B0
- MVKH 0x5a820000,$K ; K_00_19
+ MVKL 0x5a827999,$K
|| ADDAW SP,2,$XPB
+|| SUB A0,1,A0
+ MVKH 0x5a827999,$K ; K_00_19
|| MV $A,$Actx
|| MV $B,$Bctx
;;==================================================
- SPLOOPD 5 ; BODY_00_13
+ B body_00_13? ; BODY_00_13
+|| MVK 11,B0
+|| MV $XPB,$XPA
|| MV $C,$Cctx
|| MV $D,$Dctx
-|| MV $E,$Ectx
-|| MVC B0,ILC
+|| MVD $E,$Ectx
+body_00_13?:
ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
@@ -105,7 +112,7 @@ loop?:
ADD $TX3,$T,$A ; A=T+Xi
|| STW $TX3,*${XPB}++
- SPKERNEL
+|| BDEC body_00_13?,B0
;;==================================================
ROTL $A,5,$Arot ; BODY_14
|| AND $C,$B,$F
@@ -160,11 +167,11 @@ loop?:
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
-|| MVK 3,B0
;;==================================================
- SPLOOPD 5 ; BODY_16_19
-|| MVC B0,ILC
+|| B body_16_19? ; BODY_16_19
+|| MVK 1,B0
+body_16_19?:
ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
@@ -191,18 +198,19 @@ loop?:
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
- SPKERNEL
+|| BDEC body_16_19?,B0
- MVK 0xffffeba1,$K
-|| MVK 19,B0
- MVKH 0x6ed90000,$K ; K_20_39
+ MVKL 0x6ed9eba1,$K
+|| MVK 17,B0
+ MVKH 0x6ed9eba1,$K ; K_20_39
___
sub BODY_20_39 {
+my $label = shift;
$code.=<<___;
;;==================================================
- SPLOOPD 5 ; BODY_20_39
-|| MVC B0,ILC
+|| B $label ; BODY_20_39
+$label:
ROTL $A,5,$Arot
|| XOR $B,$C,$F
|| ADD $K,$E,$T ; T=E+K
@@ -228,20 +236,19 @@ $code.=<<___;
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++ ; last one is redundant
|| XOR $TX0,$TX1,$TX1
- SPKERNEL
-___
-$code.=<<___ if (!shift);
- MVK 0xffffbcdc,$K
- MVKH 0x8f1b0000,$K ; K_40_59
+|| BDEC $label,B0
___
-} &BODY_20_39();
+} &BODY_20_39("body_20_39?");
$code.=<<___;
;;==================================================
- SPLOOPD 5 ; BODY_40_59
-|| MVC B0,ILC
+ MVKL 0x8f1bbcdc,$K
+|| MVK 17,B0
+ MVKH 0x8f1bbcdc,$K ; K_40_59
+|| B body_40_59? ; BODY_40_59
|| AND $B,$C,$F
|| AND $B,$D,$F0
+body_40_59?:
ROTL $A,5,$Arot
|| XOR $F0,$F,$F
|| AND $C,$D,$F0
@@ -270,13 +277,13 @@ $code.=<<___;
|| XOR $TX0,$TX1,$TX1
|| AND $B,$C,$F
|| AND $B,$D,$F0
- SPKERNEL
+|| BDEC body_40_59?,B0
- MVK 0xffffc1d6,$K
-|| MVK 18,B0
- MVKH 0xca620000,$K ; K_60_79
+ MVKL 0xca62c1d6,$K
+|| MVK 16,B0
+ MVKH 0xca62c1d6,$K ; K_60_79
___
- &BODY_20_39(-1); # BODY_60_78
+ &BODY_20_39("body_60_78?"); # BODY_60_78
$code.=<<___;
;;==================================================
[A0] B loop?
@@ -315,7 +322,7 @@ $code.=<<___;
.endasmfunc
.sect .const
- .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha256-c64xplus.pl
copy to crypto/sha/asm/sha256-c64x.pl
index 8b92c84..fbe99c0 100644
--- a/crypto/sha/asm/sha256-c64xplus.pl
+++ b/crypto/sha/asm/sha256-c64x.pl
@@ -7,9 +7,9 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# SHA256 for C64x+.
+# SHA256 for C64x.
#
-# January 2012
+# November 2016
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unroll is unlikely
@@ -39,6 +39,14 @@ open STDOUT,">$output";
$code.=<<___;
.text
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha256_block_data_order,_sha256_block_data_order
+ .endif
+
.asg B3,RA
.asg A15,FP
.asg B15,SP
@@ -50,6 +58,7 @@ $code.=<<___;
.global _sha256_block_data_order
_sha256_block_data_order:
+__sha256_block:
.asmfunc stack_usage(64)
MV $NUM,A0 ; reassign $NUM
|| MVK -64,B0
@@ -58,10 +67,17 @@ _sha256_block_data_order:
|| [A0] MV SP,FP
[A0] ADDKPC _sha256_block_data_order,B2
|| [A0] AND B0,SP,SP ; align stack at 64 bytes
+ .if __TI_EABI__
[A0] MVK 0x00404,B1
-|| [A0] MVKL (K256-_sha256_block_data_order),$K256
+|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
[A0] MVKH 0x50000,B1
-|| [A0] MVKH (K256-_sha256_block_data_order),$K256
+|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
+ .else
+ [A0] MVK 0x00404,B1
+|| [A0] MVKL (K256-__sha256_block),$K256
+ [A0] MVKH 0x50000,B1
+|| [A0] MVKH (K256-__sha256_block),$K256
+ .endif
[A0] MVC B1,AMR ; setup circular addressing
|| [A0] MV SP,$Xia
[A0] MV SP,$Xib
@@ -79,9 +95,8 @@ _sha256_block_data_order:
LDNW *$INP++,$Xn ; pre-fetch input
LDW *$K256++,$K ; pre-fetch K256[0]
- MVK 14,B0 ; loop counters
- MVK 47,B1
-|| ADDAW $Xia,9,$Xia
+ NOP
+ ADDAW $Xia,9,$Xia
outerloop?:
SUB A0,1,A0
|| MV $A,$Actx
@@ -94,10 +109,10 @@ outerloop?:
|| MVD $H,$Hctx
|| SWAP4 $Xn,$X0
- SPLOOPD 8 ; BODY_00_14
-|| MVC B0,ILC
+ MVK 14,B0 ; loop counter
|| SWAP2 $X0,$X0
+loop_00_14?: ; BODY_00_14
LDNW *$INP++,$Xn
|| ROTL $A,30,$S0
|| OR $A,$B,$Maj
@@ -113,6 +128,7 @@ outerloop?:
|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
|| ROTL $E,7,$t1e
|| ADD $K,$H,$T1 ; T1 = h + K256[i]
+|| [B0] BDEC loop_00_14?,B0
ADD $X0,$T1,$T1 ; T1 += X[i];
|| STW $X0,*$Xib++
|| XOR $t0a,$S0,$S0
@@ -134,7 +150,7 @@ outerloop?:
MV $B,$C ; c = b
|| MV $A,$B ; b = a
|| ADD $T1,$T2,$A ; a = T1 + T2
- SPKERNEL
+;;===== branch to loop_00_14? is taken here
ROTL $A,30,$S0 ; BODY_15
|| OR $A,$B,$Maj
@@ -178,11 +194,11 @@ outerloop?:
|| MV $A,$B ; b = a
|| ADD $T1,$T2,$A ; a = T1 + T2
- SPLOOPD 10 ; BODY_16_63
-|| MVC B1,ILC
+ MVK 47,B1 ; loop counter
|| ROTL $X1,14,$t1e ; modulo-scheduled
|| ROTL $X14,13,$t1a ; modulo-scheduled
+loop_16_63?: ; BODY_16_63
XOR $t0e,$s0,$s0
|| XOR $t0a,$s1,$s1
|| MV $X15,$X14
@@ -207,6 +223,7 @@ outerloop?:
|| ROTL $E,7,$t1e
|| ADD $H,$K,$T1 ; T1 = h + K256[i]
|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
+|| [B1] BDEC loop_16_63?,B1
XOR $t0a,$S0,$S0
|| XOR $t0e,$S1,$S1
|| ADD $X0,$T1,$T1 ; T1 += X[i]
@@ -234,7 +251,7 @@ outerloop?:
|| ADD $T1,$T2,$A ; a = T1 + T2
|| SHRU $X1,3,$s0 ; modulo-scheduled
|| SHRU $X14,10,$s1 ; modulo-scheduled
- SPKERNEL
+;;===== branch to loop_16_63? is taken here
[A0] B outerloop?
|| [A0] LDNW *$INP++,$Xn ; pre-fetch input
@@ -265,7 +282,11 @@ outerloop?:
|| STW $H,*${CTXB}[7]
.endasmfunc
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
.sect ".const:sha_asm"
+ .endif
.align 128
K256:
.uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -284,7 +305,7 @@ K256:
.uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
- .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha512-c64xplus.pl
copy to crypto/sha/asm/sha512-c64x.pl
index 56c8583..e35a72a 100644
--- a/crypto/sha/asm/sha512-c64xplus.pl
+++ b/crypto/sha/asm/sha512-c64x.pl
@@ -7,11 +7,11 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# SHA512 for C64x+.
+# SHA512 for C64x.
#
-# January 2012
+# November 2016
#
-# Performance is 19 cycles per processed byte. Compared to block
+# Performance is ~19 cycles per processed byte. Compared to block
# transform function from sha512.c compiled with cl6x with -mv6400+
# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
# Loop unroll won't make it, this implementation, any faster, because
@@ -47,6 +47,14 @@ open STDOUT,">$output";
$code.=<<___;
.text
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha512_block_data_order,_sha512_block_data_order
+ .endif
+
.asg B3,RA
.asg A15,FP
.asg B15,SP
@@ -61,6 +69,7 @@ $code.=<<___;
.global _sha512_block_data_order
_sha512_block_data_order:
+__sha512_block:
.asmfunc stack_usage(40+128)
MV $NUM,A0 ; reassign $NUM
|| MVK -128,B0
@@ -75,13 +84,21 @@ _sha512_block_data_order:
[A0] STDW A11:A10,*SP[1]
|| [A0] MVC B1,AMR ; setup circular addressing
|| [A0] ADD B0,SP,SP ; alloca(128)
+ .if __TI_EABI__
[A0] AND B0,SP,SP ; align stack at 128 bytes
-|| [A0] ADDKPC _sha512_block_data_order,B1
-|| [A0] MVKL (K512-_sha512_block_data_order),$K512
- [A0] MVKH (K512-_sha512_block_data_order),$K512
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512
+ [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .else
+ [A0] AND B0,SP,SP ; align stack at 128 bytes
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL (K512-__sha512_block),$K512
+ [A0] MVKH (K512-__sha512_block),$K512
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .endif
ADDAW SP,3,$Xilo
- ADDAW SP,2,$Xihi
+ ADD SP,4*2,$Xihi ; ADDAW SP,2,$Xihi
|| MV $CTXA,$CTXB
LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
@@ -134,13 +151,13 @@ loop0_15?:
SWAP2 $T1hi,$T1hi
|| SWAP2 $T1lo,$T1lo
.endif
-loop16_79?:
- STW $T1hi,*$Xihi++[2]
+ STW $T1hi,*$Xihi++[2] ; original loop16_79?
|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
|| ADD $Hhi,$T1hi,$T1hi
|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
|| SHRU $Ehi,14,$S1hi
|| SHL $Ehi,32-14,$S1lo
+loop16_79?:
XOR $Fhi,$Ghi,$CHhi
|| XOR $Flo,$Glo,$CHlo
|| ADD KHI,$T1hi,$T1hi
@@ -213,21 +230,21 @@ loop16_79?:
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $Ehi,$T1hi,$T1hi
|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
-|| [B0] BNOP loop0_15?
|| SHRU $Ahi,39-32,$t0lo
|| SHL $Ahi,64-39,$t0hi
+ [B0] BNOP loop0_15?
+|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
-|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
-||[!B1] BNOP break?
|| SHRU $Alo,39-32,$t0hi
|| SHL $Alo,64-39,$t0lo
+||[!B0] LDW *${Xihi}[28],$T1hi
+||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
|| ADD $T1carry,$T1hi,$Ehi
-|| MV $T1lo,$Elo ; e = T1
-||[!B0] LDW *${Xihi}[28],$T1hi
-||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
+|| ROTL $T1lo,0,$Elo ; e = T1, "ghost" value
+||[!B1] BNOP break?
ADD $S0hi,$T2hi,$T2hi
|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
@@ -236,14 +253,13 @@ loop16_79?:
|| MV $T2lo,$Alo ; a = T2
|| [B0] SUB B0,1,B0
 ;;===== branch to loop0_15? is taken here
- NOP
+ [B1] LDW *${Xihi}[2],$T2hi
+|| [B1] LDW *${Xilo}[2],$T2lo ; X[i+1]
+|| [B1] SHRU $T1hi,19,$S1hi
+|| [B1] SHL $T1hi,32-19,$S1lo
+ [B1] SHRU $T1lo,19,$t0lo
+|| [B1] SHL $T1lo,32-19,$t0hi
;;===== branch to break? is taken here
- LDW *${Xihi}[2],$T2hi
-|| LDW *${Xilo}[2],$T2lo ; X[i+1]
-|| SHRU $T1hi,19,$S1hi
-|| SHL $T1hi,32-19,$S1lo
- SHRU $T1lo,19,$t0lo
-|| SHL $T1lo,32-19,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| SHRU $T1hi,61-32,$t0lo
@@ -281,7 +297,6 @@ loop16_79?:
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $S1hi,$T1hi,$T1hi
|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
-|| [B1] BNOP loop16_79?
|| SHRU $T2hi,7,$t0hi
|| SHL $T2hi,32-7,$t0lo
XOR $t0hi,$S0hi,$S0hi
@@ -289,6 +304,7 @@ loop16_79?:
|| ADD $CHhi,$T1hi,$T1hi
|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
|| SHRU $T2lo,7,$t0lo
+|| [B1] BNOP loop16_79?
 XOR $t0lo,$S0lo,$S0lo ; sigma0(X[i+1])
ADD $S0hi,$T1hi,$T1hi
@@ -296,6 +312,13 @@ loop16_79?:
|| [B1] SUB B1,1,B1
NOP ; avoid cross-path stall
ADD $T1carry,$T1hi,$T1hi
+
+ STW $T1hi,*$Xihi++[2] ; copied "top" bundle
+|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
+|| ADD $Hhi,$T1hi,$T1hi
+|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
+|| SHRU $Ehi,14,$S1hi
+|| SHL $Ehi,32-14,$S1lo
;;===== branch to loop16_79? is taken here
break?:
@@ -359,7 +382,11 @@ break?:
NOP 2 ; wait till FP is committed
.endasmfunc
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
.sect ".const:sha_asm"
+ .endif
.align 128
K512:
.uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
@@ -402,7 +429,7 @@ K512:
.uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
- .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
diff --git a/test/fips_algvs.c b/test/fips_algvs.c
index 8ff75dc..2bfd213 100644
--- a/test/fips_algvs.c
+++ b/test/fips_algvs.c
@@ -150,7 +150,7 @@ extern int fips_rsavtest_main(int argc, char **argv);
extern int fips_shatest_main(int argc, char **argv);
extern int fips_test_suite_main(int argc, char **argv);
-#if !defined(_TMS320C6400_PLUS)
+#if !defined(_TMS320C6400_PLUS) && !defined(_TMS320C6400)
#include "fips_aesavs.c"
#include "fips_cmactest.c"
#include "fips_desmovs.c"
diff --git a/util/mk1mf.pl b/util/mk1mf.pl
index 8934aba..5c4c50a 100755
--- a/util/mk1mf.pl
+++ b/util/mk1mf.pl
@@ -249,7 +249,7 @@ elsif (($platform eq "netware-clib") || ($platform eq "netware-libc") ||
$BSDSOCK=1 if ($platform eq "netware-libc-bsdsock") || ($platform eq "netware-clib-bsdsock");
require 'netware.pl';
}
-elsif ($platform eq "c64xplus")
+elsif ($platform =~ /^c64x/)
{
require "TI_CGTOOLS.pl";
}
More information about the openssl-commits
mailing list