[openssl-commits] [openssl] OpenSSL-fips-2_0-dev update

Dr. Stephen Henson steve at openssl.org
Wed Aug 30 20:25:23 UTC 2017


The branch OpenSSL-fips-2_0-dev has been updated
       via  781280094ad389e8958631b97e70f498becbd9cb (commit)
       via  5526e5791f1426553b6f4806d1ac82efd6ab33bc (commit)
      from  fe36a698477e7cb1a49de3f4cba5ad7f89f5ad4c (commit)


- Log -----------------------------------------------------------------
commit 781280094ad389e8958631b97e70f498becbd9cb
Author: Andy Polyakov <appro at openssl.org>
Date:   Fri Nov 25 11:52:06 2016 +0100

    c6x/* "facelift":
    
    - make scripts executable;
    - "parameterize" platform selection in c6x/do_fips;
    - add c6x/fips_algvs.mak;
    - add c6x/run6x.js launcher for more recent CCS versions;
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    Reviewed-by: Tim Hudson <tjh at openssl.org>
    Reviewed-by: Stephen Henson <steve at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/4265)

commit 5526e5791f1426553b6f4806d1ac82efd6ab33bc
Author: Andy Polyakov <appro at openssl.org>
Date:   Fri Nov 25 13:11:09 2016 +0100

    Add some C64x assembly modules [by minor adjustments of C64x+ modules].
    
    AES, SHA256 and SHA512 modules can actually replace corresponding
    C64x+ modules. This is because C64x+ instructions don't actually
    provide "killer-argument" advantage in these modules. As for SHA1,
    even though its performance is exactly the same, the C64x+ module is more
    responsive to interrupts, i.e. it doesn't inhibit them for periods as
    long as the C64x module does.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    Reviewed-by: Tim Hudson <tjh at openssl.org>
    Reviewed-by: Stephen Henson <steve at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/4265)

-----------------------------------------------------------------------

Summary of changes:
 Configure                                          |   3 +-
 c6x/do_fips                                        |   7 +-
 c6x/fips_algvs.mak                                 |  14 ++
 c6x/fips_standalone_sha1                           |   0
 c6x/incore6x                                       |   0
 c6x/run6x                                          |   0
 c6x/run6x.js                                       |  91 ++++++++
 crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl}    | 176 ++++++++++------
 crypto/{c64xpluscpuid.pl => c64xcpuid.pl}          | 170 +++++++++++----
 crypto/sha/asm/sha1-c64x-large.pl                  | 230 +++++++++++++++++++++
 crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl}  |  85 ++++----
 .../sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} |  49 +++--
 .../sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} |  75 ++++---
 test/fips_algvs.c                                  |   2 +-
 util/mk1mf.pl                                      |   2 +-
 15 files changed, 713 insertions(+), 191 deletions(-)
 mode change 100644 => 100755 c6x/do_fips
 create mode 100644 c6x/fips_algvs.mak
 mode change 100644 => 100755 c6x/fips_standalone_sha1
 mode change 100644 => 100755 c6x/incore6x
 mode change 100644 => 100755 c6x/run6x
 create mode 100755 c6x/run6x.js
 copy crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl} (93%)
 copy crypto/{c64xpluscpuid.pl => c64xcpuid.pl} (56%)
 create mode 100644 crypto/sha/asm/sha1-c64x-large.pl
 copy crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl} (85%)
 copy crypto/sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} (89%)
 copy crypto/sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} (89%)

diff --git a/Configure b/Configure
index 84a2bc2..679252e 100755
--- a/Configure
+++ b/Configure
@@ -636,13 +636,14 @@ my %table=(
 "uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
 
 "c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:",
+"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS:::c64xcpuid.o:::aes-c64x.o aes_cbc.o aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:",
 
 );
 
 my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A
 		    debug-VC-WIN64I debug-VC-WIN64A
 		    VC-NT VC-CE VC-WIN32 debug-VC-WIN32
-		    BC-32 c64xplus
+		    BC-32 c64xplus c64x
 		    netware-clib netware-clib-bsdsock
 		    netware-libc netware-libc-bsdsock);
 
diff --git a/c6x/do_fips b/c6x/do_fips
old mode 100644
new mode 100755
index c1c29fc..4045e60
--- a/c6x/do_fips
+++ b/c6x/do_fips
@@ -1,6 +1,11 @@
 #!/bin/sh
 
-perl Configure c64xplus fipscanisteronly no-engine
+if ! which cl6x > /dev/null 2>&1; then
+	echo 'fatal: cl6x is not on $PATH'
+	exit 1
+fi
+
+perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine
 perl util/mkfiles.pl > MINFO
 perl util/mk1mf.pl auto > c6x/fips.mak
 make -f c6x/fips.mak
diff --git a/c6x/fips_algvs.mak b/c6x/fips_algvs.mak
new file mode 100644
index 0000000..7f67927
--- /dev/null
+++ b/c6x/fips_algvs.mak
@@ -0,0 +1,14 @@
+CC=cl6x
+CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. -Ic6x/inc -Ifips -DNO_SYS_TYPES_H
+OBJ_D=c6x/tmp
+OUT_D=c6x
+
+all:	$(OUT_D)/fips_algvs.out
+
+$(OBJ_D)/fips_algvs.obj:	test/fips_algvs.c
+	$(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $<
+
+$(OUT_D)/fips_algvs.out:	$(OBJ_D)/fips_algvs.obj $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj
+	$(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/incore6x $@ || rm $@
diff --git a/c6x/fips_standalone_sha1 b/c6x/fips_standalone_sha1
old mode 100644
new mode 100755
diff --git a/c6x/incore6x b/c6x/incore6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x b/c6x/run6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x.js b/c6x/run6x.js
new file mode 100755
index 0000000..6d94949
--- /dev/null
+++ b/c6x/run6x.js
@@ -0,0 +1,91 @@
+#!/usr/bin/env dss.sh
+//
+// Debug Server Scripting C6x launcher.
+//
+
+importPackage(Packages.com.ti.debug.engine.scripting);
+importPackage(Packages.com.ti.ccstudio.scripting.environment);
+importPackage(Packages.java.lang);
+
+if (arguments.length == 0) {
+    // Extract script name from eclipse
+    var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
+    var matches = regex.exec(environment["eclipse.commands"]);
+
+    System.err.println("Usage: " + matches[1] + " executable [args]");
+    System.err.println();
+    System.err.println("You're also required to set CCSTARGETCONFIG " +
+                       "environment variable to appoint");
+    System.err.println("proper .ccxml file, customarily one of " +
+                       "$HOME/ti/CCSTargetConfigurations/*.ccxml");
+    quit(1);
+}
+
+try {
+    var prog = arguments[0];
+    var script = ScriptingEnvironment.instance();
+
+    var debugServer = script.getServer("DebugServer.1");
+
+    // CCSTARGETCONFIG environment variable should point at proper .ccxml,
+    // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
+    debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));
+
+    var debugSession = debugServer.openSession("*", "*");
+
+    // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
+    // standard output from the program...
+    var dot = prog.lastIndexOf(".");
+    var gel_out = prog + ".gel";
+    if (dot > 0) {
+        gel_out = prog.substr(0,dot) + ".gel";
+    }
+    debugSession.expression.evaluate('GEL_EnableFileOutput("'
+                                      + gel_out + '", 0, 0)');
+
+    debugSession.target.connect();
+
+    // It should be noted that "current working directory" for program
+    // executed on the target system is one where |prog| resides, and
+    // not where script executed [as one would expect]...
+    debugSession.memory.loadProgram(prog, arguments);
+
+    // Pull exit()'s address and set breakpoint, then just execute till
+    // it's reached...
+    var exitAddr = debugSession.symbol.getAddress("exit");
+    debugSession.breakpoint.add(exitAddr);
+
+    while (1) {
+        debugSession.target.run();
+
+        var PC = debugSession.expression.evaluate("PC");
+        if (PC == exitAddr) {
+            break;
+        }
+    }
+
+    // Snatch value passed to exit(), so that it can be passed down to
+    // shell as exit code from this script...
+    var exitCode = debugSession.expression.evaluate("A4");
+
+    // Last run to termination...
+    debugSession.target.run();
+    // Clean up...
+    debugSession.terminate();
+    debugServer.stop();
+
+    // It should be noted that there is kind of a bug in C6x run-time.
+    // Return value from main() is not passed to last implicit exit()
+    // call [as it would on other systems], but instead constant 1 is
+    // passed, which conventionally indicates an error. So that if one
+    // wants to pass specific exit code, or even 0 indicating "success",
+    // one has to call exit() explicitly instead of relying on value
+    // returned by main()...
+    quit(exitCode);
+
+} catch (e) {
+    // We catch everything, because default handler terminates script with
+    // "success" exit code upon exception...
+    System.err.println(e.rhinoException);
+    quit(139);
+}
diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64x.pl
similarity index 93%
copy from crypto/aes/asm/aes-c64xplus.pl
copy to crypto/aes/asm/aes-c64x.pl
index 206d7dc..0817128 100644
--- a/crypto/aes/asm/aes-c64xplus.pl
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -7,9 +7,9 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# [Endian-neutral] AES for C64x+.
+# [Endian-neutral] AES for C64x.
 #
-# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# Even though loops are scheduled for 13 cycles, and thus expected
 # performance is ~8.5 cycles per byte processed with 128-bit key,
 # measured performance turned to be ~10 cycles per byte. Discrepancy
 # must be caused by limitations of L1D memory banking(*), see SPRU871
@@ -45,6 +45,18 @@ open STDOUT,">$output";
 $code=<<___;
 	.text
 
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	AES_encrypt,_AES_encrypt
+	.asg	AES_decrypt,_AES_decrypt
+	.asg	AES_set_encrypt_key,_AES_set_encrypt_key
+	.asg	AES_set_decrypt_key,_AES_set_decrypt_key
+	.asg	AES_ctr32_encrypt,_AES_ctr32_encrypt
+	.endif
+
 	.asg	B3,RA
 	.asg	A4,INP
 	.asg	B4,OUT
@@ -75,13 +87,23 @@ _AES_encrypt:
 	.asmfunc
 	MVK	1,B2
 __encrypt:
+	.if	__TI_EABI__
    [B2]	LDNDW	*INP++,A9:A8			; load input
-||	MVKL	(AES_Te-_AES_encrypt),$TEA
-||	ADDKPC	_AES_encrypt,B0
+||	MVKL	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
    [B2]	LDNDW	*INP++,B9:B8
-||	MVKH	(AES_Te-_AES_encrypt),$TEA
+||	MVKH	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
 ||	ADD	0,KEY,$KPA
 ||	ADD	4,KEY,$KPB
+	.else
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Te-__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Te-__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.endif
 	LDW	*$KPA++[2],$Te0[0]		; zero round key
 ||	LDW	*$KPB++[2],$Te0[1]
 ||	MVK	60,A0
@@ -107,15 +129,14 @@ __encrypt:
 ||	XOR	$Te0[1],$s[1],$s[1]
 ||	LDW	*$KPA++[2],$K[0]		; 1st round key
 ||	LDW	*$KPB++[2],$K[1]
-	SUB	B0,2,B0
 
-	SPLOOPD	13
-||	MVC	B0,ILC
-||	LDW	*$KPA++[2],$K[2]
+	LDW	*$KPA++[2],$K[2]
 ||	LDW	*$KPB++[2],$K[3]
-;;====================================================================
-	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[1],EXT1,24,$Te1[1]
 ||	EXTU	$s[0],EXT3,24,$Te3[0]
+||	SUB	B0,1,B0
+;;====================================================================
+enc_loop?:
 	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
 ||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
 ||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
@@ -150,12 +171,14 @@ __encrypt:
 ||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
 ||	EXTU	$s[2],EXT0,24,$Te0[2]
 ||	EXTU	$s[3],EXT0,24,$Te0[3]
+|| [B0]	SUB	B0,1,B0
 	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
 ||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
 ||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
 ||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
 ||	XOR	$K[0],$Te3[0],$s[0]
 ||	XOR	$K[1],$Te1[1],$s[1]
+|| [B0]	BNOP	enc_loop?
 	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
 ||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
 ||	XOR	$K[2],$Te1[0],$s[2]
@@ -176,14 +199,13 @@ __encrypt:
 ||	XOR	$s[3],$Te2[1],$s[3]
 ||	XOR	$s[0],$Te0[0],$s[0]
 ||	XOR	$s[1],$Te0[1],$s[1]
-	SPKERNEL
-||	XOR.L	$s[2],$Te3[2],$s[2]
-||	XOR.L	$s[3],$Te1[3],$s[3]
-;;====================================================================
-	ADD.D	${TEA},A0,${TEA}		; point to Te4
-||	ADD.D	${TEB},A0,${TEB}
+	XOR	$s[2],$Te3[2],$s[2]
+||	XOR	$s[3],$Te1[3],$s[3]
 ||	EXTU	$s[1],EXT1,24,$Te1[1]
 ||	EXTU	$s[0],EXT3,24,$Te3[0]
+||[!B0]	ADD	${TEA},A0,${TEA}		; point to Te4
+||[!B0]	ADD	${TEB},A0,${TEB}
+;;====================================================================
 	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
 ||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
 ||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
@@ -277,13 +299,23 @@ _AES_decrypt:
 	.asmfunc
 	MVK	1,B2
 __decrypt:
+	.if	__TI_EABI__
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||	ADDKPC	__decrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.else
    [B2]	LDNDW	*INP++,A9:A8			; load input
-||	MVKL	(AES_Td-_AES_decrypt),$TEA
-||	ADDKPC	_AES_decrypt,B0
+||	MVKL	(AES_Td-__decrypt),$TEA
+||	ADDKPC	__decrypt,B0
    [B2]	LDNDW	*INP++,B9:B8
-||	MVKH	(AES_Td-_AES_decrypt),$TEA
+||	MVKH	(AES_Td-__decrypt),$TEA
 ||	ADD	0,KEY,$KPA
 ||	ADD	4,KEY,$KPB
+	.endif
 	LDW	*$KPA++[2],$Td0[0]		; zero round key
 ||	LDW	*$KPB++[2],$Td0[1]
 ||	MVK	60,A0
@@ -309,15 +341,14 @@ __decrypt:
 ||	XOR	$Td0[1],$s[1],$s[1]
 ||	LDW	*$KPA++[2],$K[0]		; 1st round key
 ||	LDW	*$KPB++[2],$K[1]
-	SUB	B0,2,B0
 
-	SPLOOPD	13
-||	MVC	B0,ILC
-||	LDW	*$KPA++[2],$K[2]
+	LDW	*$KPA++[2],$K[2]
 ||	LDW	*$KPB++[2],$K[3]
-;;====================================================================
-	EXTU	$s[1],EXT3,24,$Td3[1]
+||	EXTU	$s[1],EXT3,24,$Td3[1]
 ||	EXTU	$s[0],EXT1,24,$Td1[0]
+||	SUB	B0,1,B0
+;;====================================================================
+dec_loop?:
 	LDW	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
 ||	LDW	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
 ||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
@@ -352,12 +383,14 @@ __decrypt:
 ||	ROTL	$Td3[0],TBL3,$Td1[1]		; t3
 ||	EXTU	$s[2],EXT0,24,$Td0[2]
 ||	EXTU	$s[3],EXT0,24,$Td0[3]
+|| [B0]	SUB	B0,1,B0
 	LDW	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
 ||	LDW	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
 ||	ROTL	$Td2[2],TBL2,$Td2[2]		; t0
 ||	ROTL	$Td2[3],TBL2,$Td2[3]		; t1
 ||	XOR	$K[0],$Td1[0],$s[0]
 ||	XOR	$K[1],$Td3[1],$s[1]
+|| [B0]	BNOP	dec_loop?
 	ROTL	$Td1[3],TBL1,$Td3[2]		; t0
 ||	ROTL	$Td3[2],TBL3,$Td1[3]		; t1
 ||	XOR	$K[2],$Td3[0],$s[2]
@@ -378,14 +411,13 @@ __decrypt:
 ||	XOR	$s[3],$Td2[1],$s[3]
 ||	XOR	$s[0],$Td0[0],$s[0]
 ||	XOR	$s[1],$Td0[1],$s[1]
-	SPKERNEL
-||	XOR.L	$s[2],$Td1[2],$s[2]
-||	XOR.L	$s[3],$Td3[3],$s[3]
-;;====================================================================
-	ADD.D	${TEA},A0,${TEA}		; point to Td4
-||	ADD.D	${TEB},A0,${TEB}
+	XOR	$s[2],$Td1[2],$s[2]
+||	XOR	$s[3],$Td3[3],$s[3]
 ||	EXTU	$s[1],EXT3,24,$Td3[1]
 ||	EXTU	$s[0],EXT1,24,$Td1[0]
+||[!B0]	ADD	${TEA},A0,${TEA}		; point to Td4
+||[!B0]	ADD	${TEB},A0,${TEB}
+;;====================================================================
 	LDBU	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
 ||	LDBU	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
 ||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
@@ -515,17 +547,26 @@ __set_encrypt_key:
    [B0]	B	key256?
 || [A1]	LDNDW	*INP++,B19:B18
 
+	.if	__TI_EABI__
    [A0]	ADD	0,KEY,$KPA
 || [A0]	ADD	4,KEY,$KPB
-|| [A0]	MVKL	(AES_Te4-_AES_set_encrypt_key),$TEA
-|| [A0]	ADDKPC	_AES_set_encrypt_key,B6
-   [A0]	MVKH	(AES_Te4-_AES_set_encrypt_key),$TEA
+|| [A0]	MVKL	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0]	ADDKPC	__set_encrypt_key,B6
+   [A0]	MVKH	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
    [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
+	.else
+   [A0]	ADD	0,KEY,$KPA
+|| [A0]	ADD	4,KEY,$KPB
+|| [A0]	MVKL	(AES_Te4-__set_encrypt_key),$TEA
+|| [A0]	ADDKPC	__set_encrypt_key,B6
+   [A0]	MVKH	(AES_Te4-__set_encrypt_key),$TEA
+   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
+	.endif
 	NOP
 	NOP
 
 	BNOP	RA,5
-||	MVK	-2,RET				; unknown bit lenght
+||	MVK	-2,RET				; unknown bit length
 ||	MVK	0,B0				; redundant
 ;;====================================================================
 ;;====================================================================
@@ -543,13 +584,12 @@ key128?:
 	.endif
 
 	MVK	256,A0
-||	MVK	9,B0
+||	MVK	8,B0
 
-	SPLOOPD	14
-||	MVC	B0,ILC
-||	MV	$TEA,$TEB
+	MV	$TEA,$TEB
 ||	ADD	$TEA,A0,A30			; rcon
 ;;====================================================================
+loop128?:
 	LDW	*A30++[1],A31			; rcon[i]
 ||	MV	$Te4[2],$K[2]
 ||	EXTU	$K[3],EXT1,24,$Te4[0]
@@ -576,10 +616,12 @@ key128?:
 	.if	.BIG_ENDIAN
 	PACK2	$Te4[0],$Te4[1],$Te4[1]
 	PACK2	$Te4[3],A0,$Te4[3]
+||	BDEC	loop128?,B0
 	PACKL4	$Te4[1],$Te4[3],$Te4[3]
 	.else
 	PACK2	$Te4[1],$Te4[0],$Te4[1]
 	PACK2	$Te4[3],A0,$Te4[3]
+||	BDEC	loop128?,B0
 	PACKL4	$Te4[3],$Te4[1],$Te4[3]
 	.endif
 	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
@@ -587,7 +629,6 @@ key128?:
 	MV	$Te4[0],$K[0]
 ||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
 	XOR	$Te4[2],$K[3],$K[3]		; K[3]
-	SPKERNEL
 ;;====================================================================
 	BNOP	RA
 	MV	$Te4[2],$K[2]
@@ -802,17 +843,15 @@ _AES_set_decrypt_key:
 ret?:						; B0 holds rounds or zero
   [!B0]	BNOP	B31				; return if zero
    [B0]	SHL	B0,4,A0				; offset to last round key
-   [B0]	SHRU	B0,1,B1
-   [B0]	SUB	B1,1,B1
-   [B0]	MVK	0x0000001B,B3			; AES polynomial
+   [B0]	SHRU	B0,1,B2
+   [B0]	SUB	B2,2,B2
+|| [B0]	MVK	0x0000001B,B3			; AES polynomial
    [B0]	MVKH	0x07000000,B3
-
-	SPLOOPD	9				; flip round keys
-||	MVC	B1,ILC
-||	MV	B30,$KPA
-||	ADD	B30,A0,$KPB
-||	MVK	16,A0				; sizeof(round key)
+|| [B0]	MV	B30,$KPA
+   [B0]	ADD	B30,A0,$KPB
+|| [B0]	MVK	16,A0				; sizeof(round key)
 ;;====================================================================
+flip_loop?:
 	LDW	*${KPA}[0],A16
 ||	LDW	*${KPB}[0],B16
 	LDW	*${KPA}[1],A17
@@ -823,6 +862,7 @@ ret?:						; B0 holds rounds or zero
 ||	ADD	$KPA,A0,$KPA
 ||	LDW	*${KPB}[3],B19
 ||	SUB	$KPB,A0,$KPB
+||	BDEC	flip_loop?,B2
 	NOP
 	STW	B16,*${KPA}[-4]
 ||	STW	A16,*${KPB}[4]
@@ -832,7 +872,6 @@ ret?:						; B0 holds rounds or zero
 ||	STW	A18,*${KPB}[6]
 	STW	B19,*${KPA}[-1]
 ||	STW	A19,*${KPB}[7]
-	SPKERNEL
 ;;====================================================================
 	SUB	B0,1,B0				; skip last round
 ||	ADD	B30,A0,$KPA			; skip first round
@@ -847,10 +886,9 @@ ret?:						; B0 holds rounds or zero
 ||	MVK	0x00000B0B,B24
 	MVKH	0x09090000,A24
 ||	MVKH	0x0B0B0000,B24
-	MVC	B0,ILC
-||	SUB	B0,1,B0
+	SUB	B0,1,B0
 
-	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
+	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
 ||	GMPY4	$K[1],A24,$Kx9[1]
 ||	MVK	0x00000D0D,A25
 ||	MVK	0x00000E0E,B25
@@ -859,14 +897,14 @@ ret?:						; B0 holds rounds or zero
 ||	MVKH	0x0D0D0000,A25
 ||	MVKH	0x0E0E0000,B25
 
-	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
+	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
 ||	GMPY4	$K[1],B24,$KxB[1]
 	GMPY4	$K[2],B24,$KxB[2]
 ||	GMPY4	$K[3],B24,$KxB[3]
 
-	SPLOOP	11				; InvMixColumns
 ;;====================================================================
-	GMPY4	$K[0],A25,$KxD[0]		; ·0x0D
+invmix_loop?:
+	GMPY4	$K[0],A25,$KxD[0]		; ·0x0D
 ||	GMPY4	$K[1],A25,$KxD[1]
 ||	SWAP2	$Kx9[0],$Kx9[0]			; rotate by 16
 ||	SWAP2	$Kx9[1],$Kx9[1]
@@ -883,7 +921,7 @@ ret?:						; B0 holds rounds or zero
 || [B0]	LDW	*${KPA}[6],$K[2]
 || [B0]	LDW	*${KPB}[7],$K[3]
 
-	GMPY4	$s[0],B25,$KxE[0]		; ·0x0E
+	GMPY4	$s[0],B25,$KxE[0]		; ·0x0E
 ||	GMPY4	$s[1],B25,$KxE[1]
 ||	XOR	$Kx9[0],$KxB[0],$KxB[0]
 ||	XOR	$Kx9[1],$KxB[1],$KxB[1]
@@ -900,10 +938,11 @@ ret?:						; B0 holds rounds or zero
 ||	ROTL	$KxB[3],TBL3,$KxB[3]
 ||	SWAP2	$KxD[2],$KxD[2]
 ||	SWAP2	$KxD[3],$KxD[3]
+|| [B0]	B	invmix_loop?
 
 	XOR	$KxE[0],$KxD[0],$KxE[0]
 ||	XOR	$KxE[1],$KxD[1],$KxE[1]
-|| [B0]	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
+|| [B0]	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
 || [B0]	GMPY4	$K[1],A24,$Kx9[1]
 ||	ADDAW	$KPA,4,$KPA
 	XOR	$KxE[2],$KxD[2],$KxE[2]
@@ -914,7 +953,7 @@ ret?:						; B0 holds rounds or zero
 
 	XOR	$KxB[0],$KxE[0],$KxE[0]
 ||	XOR	$KxB[1],$KxE[1],$KxE[1]
-|| [B0]	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
+|| [B0]	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
 || [B0]	GMPY4	$K[1],B24,$KxB[1]
 	XOR	$KxB[2],$KxE[2],$KxE[2]
 ||	XOR	$KxB[3],$KxE[3],$KxE[3]
@@ -925,7 +964,6 @@ ret?:						; B0 holds rounds or zero
 	STW	$KxE[2],*${KPA}[-2]
 ||	STW	$KxE[3],*${KPB}[-1]
 || [B0]	SUB	B0,1,B0
-	SPKERNEL
 ;;====================================================================
 	BNOP	B31,3
 	MVC	B30,GFPGFR			; restore GFPGFR(*)
@@ -943,7 +981,8 @@ _AES_ctr32_encrypt:
 	.asmfunc
 	LDNDW	*${ivp}[0],A31:A30	; load counter value
 ||	MV	$blocks,A2		; reassign $blocks
-||	DMV	RA,$key,B27:B26		; reassign RA and $key
+||	MV	RA,B27			; reassign RA
+||	MV	$key,B26		; reassign $key
 	LDNDW	*${ivp}[1],B31:B30
 ||	MVK	0,B2			; don't let __encrypt load input
 ||	MVK	0,A1			; and postpone writing output
@@ -965,13 +1004,15 @@ ctr32_loop?:
 || [A2]	LDNDW	*INP++,B29:B28
 	.if	.BIG_ENDIAN
    [A1]	STNDW	A9:A8,*OUT++		; save output
-|| [A2]	DMV	A31,A30,A9:A8		; pass counter value to __encrypt
+|| [A2]	MV	A31,A9			; pass counter value to __encrypt
+|| [A2]	MV	A30,A8			; pass counter value to __encrypt
    [A1]	STNDW	B9:B8,*OUT++
 || [A2]	DMV	B31,B30,B9:B8
 || [A2]	ADD	B30,1,B30		; counter++
 	.else
    [A1]	STNDW	A9:A8,*OUT++		; save output
-|| [A2]	DMV	A31,A30,A9:A8
+|| [A2]	MV	A31,A9
+|| [A2]	MV	A30,A8
 || [A2]	SWAP2	B31,B0
 || [A2]	ADD	B31,1,B31		; counter++
    [A1]	STNDW	B9:B8,*OUT++
@@ -989,7 +1030,11 @@ ___
 }
 # Tables are kept in endian-neutral manner
 $code.=<<___;
+	.if	__TI_EABI__
+	.sect	".text:aes_asm.const"
+	.else
 	.sect	".const:aes_asm"
+	.endif
 	.align	128
 AES_Te:
 	.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84
@@ -1322,8 +1367,9 @@ AES_Td4:
 	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-	.cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.cstring "AES for C64x, CRYPTOGAMS by <appro\@openssl.org>"
 	.align	4
 ___
 
 print $code;
+close STDOUT;
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xcpuid.pl
similarity index 56%
copy from crypto/c64xpluscpuid.pl
copy to crypto/c64xcpuid.pl
index 067b693..88fd153 100644
--- a/crypto/c64xpluscpuid.pl
+++ b/crypto/c64xcpuid.pl
@@ -1,5 +1,10 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 #
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
 
 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -7,17 +12,39 @@ open STDOUT,">$output";
 $code.=<<___;
 	.text
 
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	OPENSSL_rdtsc,_OPENSSL_rdtsc
+	.asg	OPENSSL_cleanse,_OPENSSL_cleanse
+	.asg	CRYPTO_memcmp,_CRYPTO_memcmp
+	.asg	OPENSSL_atomic_add,_OPENSSL_atomic_add
+	.asg	OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+	.asg	OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+	.asg	OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+	.endif
+
 	.asg	B3,RA
+	.asg	0x01AC0000,TIMER_BASE	; Timer 2
 
 	.global	_OPENSSL_rdtsc
 _OPENSSL_rdtsc:
 	.asmfunc
-	B	RA
-	MVC	TSCL,B0
-	MVC	TSCH,B1
-  [!B0]	MVC	B0,TSCL		; start TSC
-	MV	B0,A4
-	MV	B1,A5
+	MVKL	TIMER_BASE,A5
+	MVKH	TIMER_BASE,A5
+	LDW	*A5[0],A2	; load CTL
+	LDW	*A5[2],A4	; load CTN
+	NOP	2
+	.if	.BIG_ENDIAN
+	MVK	0x2c0,A7	; internal clock source, don't hold, go
+||	MVK	-1,A6		; maximum period
+	.else
+	MVK	0x2c0,A6	; internal clock source, don't hold, go
+||	MVK	-1,A7		; maximum period
+	.endif
+  [!A2]	STDW	A7:A6,*A5[0]	; fire it up
+||	BNOP	RA,5
 	.endasmfunc
 
 	.global	_OPENSSL_cleanse
@@ -28,28 +55,34 @@ _OPENSSL_cleanse:
 ||	SHRU	B4,3,B0		; is length >= 8
 ||	ADD	1,A4,B6
   [!B0]	BNOP	RA
+|| [B0]	SUB	B0,1,B2
 ||	ZERO	A1
 ||	ZERO	B1
-   [B0]	MVC	B0,ILC
+   [B2]	BDEC	cleanse_loop?,B2
 ||[!B0]	CMPLT	0,B4,A1
 ||[!B0]	CMPLT	1,B4,B1
+||	ZERO	B5
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
 ||[!B0]	CMPLT	2,B4,A1
 ||[!B0]	CMPLT	3,B4,B1
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
 ||[!B0]	CMPLT	4,B4,A1
 ||[!B0]	CMPLT	5,B4,B1
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
 ||[!B0]	CMPLT	6,B4,A1
    [A1]	STB	A2,*A4++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
 
-	SPLOOP	1
+cleanse_loop?:
 	STNDW	A3:A2,*A4++
 ||	SUB	B4,8,B4
-	SPKERNEL
+|| [B2]	BDEC	cleanse_loop?,B2
 
 	MV	B4,B0		; remaining bytes
 ||	ADD	1,A4,B6
@@ -57,33 +90,73 @@ _OPENSSL_cleanse:
    [B0]	CMPLT	0,B0,A1
 || [B0]	CMPLT	1,B0,B1
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
 || [B0]	CMPLT	2,B0,A1
 || [B0]	CMPLT	3,B0,B1
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
 || [B0]	CMPLT	4,B0,A1
 || [B0]	CMPLT	5,B0,B1
    [A1]	STB	A2,*A4++[2]
-|| [B1] STB	B2,*B6++[2]
+|| [B1] STB	B5,*B6++[2]
 || [B0]	CMPLT	6,B0,A1
    [A1]	STB	A2,*A4++[2]
 	.endasmfunc
 
+	.if	0
+	.global	_CRYPTO_memcmp
+_CRYPTO_memcmp:
+	.asmfunc
+	MV	A6,B0
+  [!B0]	BNOP	RA
+||[!B0]	ZERO	A4
+|| [B0]	ZERO	A1:A0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+memcmp_loop?:
+	OR	A1,A0,A0
+||	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+	BNOP	RA,3
+	ZERO	A4
+  [A0]	MVK	1,A4
+	.endasmfunc
+	.endif
+
 	.global	_OPENSSL_atomic_add
 _OPENSSL_atomic_add:
 	.asmfunc
-	MV	A4,B0
-atomic_add?:
-	LL	*B0,B5
-	NOP	4
+	BNOP	atomic_store?	; pre-C64x+ systems are uni-processor, it's
+||	LDW	*A4,B5		; enough to hold interrupts off through
+				; the load-update-store cycle to achieve
+				; atomicity
+	NOP
+	BNOP	RA,3		; and this branch stretches even over store
 	ADD	B4,B5,B5
-	SL	B5,*B0
-	CMTL	*B0,B1
-	NOP	4
-  [!B1]	B	atomic_add?
-   [B1]	BNOP	RA,4
-	MV	B5,A4
+atomic_store?:
+	STW	B5,*A4
+||	MV	B5,A4
 	.endasmfunc
 
 	.global	_OPENSSL_wipe_cpu
@@ -150,35 +223,34 @@ _OPENSSL_instrument_bus:
 	MV	B4,B0			; reassign sizeof(output)
 ||	MV	A4,B4			; reassign output
 ||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
 	MV	B0,A4			; return value
 ||	MVK	1,A1
 ||	MVKH	0x01840000,A3		; L1DWIBAR
-	MVC	TSCL,B8			; collect 1st tick
+||	MVKH	TIMER_BASE,B16
+	LDW	*B16[2],B8		; collect 1st tick
 ||	MVK	0x00004010,A5
+	NOP	4
 	MV	B8,B9			; lasttick = tick
 ||	MVK	0,B7			; lastdiff = 0
 ||	MVKH	0x01840000,A5		; L2WIBAR
 	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
 	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
-	LL	*B4,B5
+	LDW	*B4,B5
 	NOP	4
 	ADD	B7,B5,B5
-	SL	B5,*B4
-	CMTL	*B4,B1
-	NOP	4
 	STW	B5,*B4
 bus_loop1?:
-	MVC	TSCL,B8
+	LDW	*B16[2],B8
 || [B0]	SUB	B0,1,B0
+	NOP	4
 	SUB	B8,B9,B7		; lastdiff = tick - lasttick
 ||	MV	B8,B9			; lasttick = tick
 	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
 	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
-	LL	*B4,B5
+	LDW	*B4,B5
 	NOP	4
 	ADD	B7,B5,B5
-	SL	B5,*B4
-	CMTL	*B4,B1
 	STW	B5,*B4			; [!B1] is removed to flatten samples
 ||	ADDK	4,B4
 || [B0]	BNOP	bus_loop1?,5
@@ -192,42 +264,42 @@ _OPENSSL_instrument_bus2:
 	MV	A6,B0			; reassign max
 ||	MV	B4,A6			; reassing sizeof(output)
 ||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
 	MV	A4,B4			; reassign output
 ||	MVK	0,A4			; return value
 ||	MVK	1,A1
 ||	MVKH	0x01840000,A3		; L1DWIBAR
+||	MVKH	TIMER_BASE,B16
 
-	MVC	TSCL,B8			; collect 1st tick
+	LDW	*B16[2],B8		; collect 1st tick
 ||	MVK	0x00004010,A5
+	NOP	4
 	MV	B8,B9			; lasttick = tick
 ||	MVK	0,B7			; lastdiff = 0
 ||	MVKH	0x01840000,A5		; L2WIBAR
 	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
 	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
-	LL	*B4,B5
+	LDW	*B4,B5
 	NOP	4
 	ADD	B7,B5,B5
-	SL	B5,*B4
-	CMTL	*B4,B1
-	NOP	4
 	STW	B5,*B4
 
-	MVC	TSCL,B8			; collect 1st diff
+	LDW	*B16[2],B8		; collect 1st diff
+	NOP	4
 	SUB	B8,B9,B7		; lastdiff = tick - lasttick
 ||	MV	B8,B9			; lasttick = tick
 ||	SUB	B0,1,B0
 bus_loop2?:
 	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
 	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
-	LL	*B4,B5
+	LDW	*B4,B5
 	NOP	4
 	ADD	B7,B5,B5
-	SL	B5,*B4
-	CMTL	*B4,B1
 	STW	B5,*B4			; [!B1] is removed to flatten samples
 ||[!B0]	BNOP	bus_loop2_done?,2
 ||	SUB	B0,1,B0
-	MVC	TSCL,B8
+	LDW	*B16[2],B8
+	NOP	4
 	SUB	B8,B9,B8
 ||	MV	B8,B9
 	CMPEQ	B8,B7,B2
@@ -240,6 +312,14 @@ bus_loop2?:
 bus_loop2_done?:
 	BNOP	RA,5
 	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".init_array"
+	.else
+	.sect	".pinit"
+	.endif
+	.align	4
+	.long	_OPENSSL_rdtsc		; auto-start timer
 ___
 
 print $code;
diff --git a/crypto/sha/asm/sha1-c64x-large.pl b/crypto/sha/asm/sha1-c64x-large.pl
new file mode 100644
index 0000000..3916ff3
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is fully-unrolled SHA1 implementation. It's 25% faster than
+# one with compact loops, doesn't use in-memory ring buffer, as
+# everything is accommodated in registers, and has "perfect" interrupt
+# agility. Drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
+ at V = ($A,$B,$C,$D,$E);
+ at X = map("B$_",(16..31));
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 LDNW	*${INP}++, at X[$i+2]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	 SWAP2	@X[$i+1], at X[$i+1]
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1], at X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 SWAP2	@X[$i+1], at X[$i+1]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1], at X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==15);
+||	 XOR	@X[($j+2)&15], at X[$j], at X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15], at X[$j], at X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15], at X[$j], at X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1, at X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i>15);
+||	 XOR	@X[($j+2)&15], at X[$j], at X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15], at X[$j], at X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15], at X[$j], at X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1, at X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<79);
+||	 XOR	@X[($j+2)&15], at X[$j], at X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15], at X[$j], at X[$j]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15], at X[$j], at X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1, at X[$j]
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+___
+$code.=<<___				if ($i==79);
+|| [A0]	B	loop?
+|| [A0]	LDNW	*${INP}++, at X[0]		; pre-fetch input
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+|| [A0]	LDNW	*${INP}++, at X[1]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+||	ADD	$Bctx,$a,$a		; accumulate context
+||	ADD	$Cctx,$b,$b
+	ADD	$Dctx,$c,$c
+||	ADD	$Ectx,$d,$d
+||	ADD	$Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___;
+||	 XOR	@X[($j+2)&15], at X[$j], at X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	AND	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15], at X[$j], at X[$j]
+	XOR	$F0,$F,$F
+||	AND	$c,$d,$F0
+||	ROTL	$b,30,$b
+||	 XOR	@X[($j+13)&15], at X[$j], at X[$j]
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1, at X[$j]
+	ADD	$F,$e,$e		; E+=F_40_59(B,C,D)
+___
+}
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc
+	MV	$NUM,A0			; reassign $NUM
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	LDW	*${CTX}[0],$A		; load A-E...
+   [A0]	LDW	*${CTX}[1],$B
+   [A0]	LDW	*${CTX}[2],$C
+   [A0]	LDW	*${CTX}[3],$D
+   [A0]	LDW	*${CTX}[4],$E
+   [A0]	LDNW	*${INP}++, at X[0]		; pre-fetch input
+   [A0]	LDNW	*${INP}++, at X[1]
+	NOP	3
+
+loop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MVD	$B,$Bctx
+||	SWAP2	@X[0], at X[0]
+||	MVKL	0x5a827999,$K
+	MVKH	0x5a827999,$K		; K_00_19
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MVD	$E,$Ectx
+||	SWAP4	@X[0], at X[0]
+___
+for ($i=0;$i<20;$i++)	{ &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x6ed9eba1,$K
+	MVKH	0x6ed9eba1,$K		; K_20_39
+___
+for (;$i<40;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x8f1bbcdc,$K
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+___
+for (;$i<60;$i++)	{ &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0xca62c1d6,$K
+	MVKH	0xca62c1d6,$K		; K_60_79
+___
+for (;$i<80;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	BNOP	RA			; return
+	STW	$A,*${CTX}[0]		; emit A-E...
+	STW	$B,*${CTX}[1]
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64x.pl
similarity index 85%
copy from crypto/sha/asm/sha1-c64xplus.pl
copy to crypto/sha/asm/sha1-c64x.pl
index 87000d1..d7a9dd1 100644
--- a/crypto/sha/asm/sha1-c64xplus.pl
+++ b/crypto/sha/asm/sha1-c64x.pl
@@ -7,19 +7,19 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA1 for C64x+.
+# SHA1 for C64x.
 #
-# November 2011
+# November 2016
 #
 # If compared to compiler-generated code with similar characteristics,
 # i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
 # this implementation is 25% smaller and >2x faster. In absolute terms
 # performance is (quite impressive) ~6.5 cycles per processed byte.
-# Fully unrolled assembler would be ~5x larger and is likely to be
-# ~15% faster. It would be free from references to intermediate ring
-# buffer, but put more pressure on L1P [both because the code would be
-# larger and won't be using SPLOOP buffer]. There are no plans to
-# realize fully unrolled variant though...
+# Unlike its predecessor, sha1-c64xplus module, this module has worse
+# interrupt agility. While original added up to 5 cycles delay to
+# response to interrupt, this module adds up to 100. Fully unrolled
+# implementation doesn't add any delay and even 25% faster, but is
+# almost 5x larger...
 #
 # !!! Note that this module uses AMR, which means that all interrupt
 # service routines are expected to preserve it and for own well-being
@@ -39,6 +39,13 @@ open STDOUT,">$output";
 $code=<<___;
 	.text
 
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
 	.asg	B3,RA
 	.asg	A15,FP
 	.asg	B15,SP
@@ -70,21 +77,21 @@ _sha1_block_data_order:
 	NOP	1
 
 loop?:
-	MVK	0x00007999,$K
-||	ADDAW	SP,2,$XPA
-||	SUB	A0,1,A0
-||	MVK	13,B0
-	MVKH	0x5a820000,$K		; K_00_19
+	MVKL	0x5a827999,$K
 ||	ADDAW	SP,2,$XPB
+||	SUB	A0,1,A0
+	MVKH	0x5a827999,$K		; K_00_19
 ||	MV	$A,$Actx
 ||	MV	$B,$Bctx
 ;;==================================================
-	SPLOOPD	5			; BODY_00_13
+	B	body_00_13?		; BODY_00_13
+||	MVK	11,B0
+||	MV	$XPB,$XPA
 ||	MV	$C,$Cctx
 ||	MV	$D,$Dctx
-||	MV	$E,$Ectx
-||	MVC	B0,ILC
+||	MVD	$E,$Ectx
 
+body_00_13?:
 	ROTL	$A,5,$Arot
 ||	AND	$C,$B,$F
 ||	ANDN	$D,$B,$F0
@@ -105,7 +112,7 @@ loop?:
 
 	ADD	$TX3,$T,$A		; A=T+Xi
 ||	STW	$TX3,*${XPB}++
-	SPKERNEL
+||	BDEC	body_00_13?,B0
 ;;==================================================
 	ROTL	$A,5,$Arot		; BODY_14
 ||	AND	$C,$B,$F
@@ -160,11 +167,11 @@ loop?:
 	ADD	$TX2,$T,$A		; A=T+Xi
 ||	STW	$TX2,*${XPB}++
 ||	XOR	$TX0,$TX1,$TX1
-||	MVK	3,B0
 ;;==================================================
-	SPLOOPD	5			; BODY_16_19
-||	MVC	B0,ILC
+||	B	body_16_19?		; BODY_16_19
+||	MVK	1,B0
 
+body_16_19?:
 	ROTL	$A,5,$Arot
 ||	AND	$C,$B,$F
 ||	ANDN	$D,$B,$F0
@@ -191,18 +198,19 @@ loop?:
 	ADD	$TX2,$T,$A		; A=T+Xi
 ||	STW	$TX2,*${XPB}++
 ||	XOR	$TX0,$TX1,$TX1
-	SPKERNEL
+||	BDEC	body_16_19?,B0
 
-	MVK	0xffffeba1,$K
-||	MVK	19,B0
-	MVKH	0x6ed90000,$K		; K_20_39
+	MVKL	0x6ed9eba1,$K
+||	MVK	17,B0
+	MVKH	0x6ed9eba1,$K		; K_20_39
 ___
 sub BODY_20_39 {
+my $label = shift;
 $code.=<<___;
 ;;==================================================
-	SPLOOPD	5			; BODY_20_39
-||	MVC	B0,ILC
+||	B	$label			; BODY_20_39
 
+$label:
 	ROTL	$A,5,$Arot
 ||	XOR	$B,$C,$F
 ||	ADD	$K,$E,$T		; T=E+K
@@ -228,20 +236,19 @@ $code.=<<___;
 	ADD	$TX2,$T,$A		; A=T+Xi
 ||	STW	$TX2,*${XPB}++		; last one is redundant
 ||	XOR	$TX0,$TX1,$TX1
-	SPKERNEL
-___
-$code.=<<___ if (!shift);
-	MVK	0xffffbcdc,$K
-	MVKH	0x8f1b0000,$K		; K_40_59
+||	BDEC	$label,B0
 ___
-}	&BODY_20_39();
+}	&BODY_20_39("body_20_39?");
 $code.=<<___;
 ;;==================================================
-	SPLOOPD	5			; BODY_40_59
-||	MVC	B0,ILC
+	MVKL	0x8f1bbcdc,$K
+||	MVK	17,B0
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+||	B	body_40_59?		; BODY_40_59
 ||	AND	$B,$C,$F
 ||	AND	$B,$D,$F0
 
+body_40_59?:
 	ROTL	$A,5,$Arot
 ||	XOR	$F0,$F,$F
 ||	AND	$C,$D,$F0
@@ -270,13 +277,13 @@ $code.=<<___;
 ||	XOR	$TX0,$TX1,$TX1
 ||	AND	$B,$C,$F
 ||	AND	$B,$D,$F0
-	SPKERNEL
+||	BDEC	body_40_59?,B0
 
-	MVK	0xffffc1d6,$K
-||	MVK	18,B0
-	MVKH	0xca620000,$K		; K_60_79
+	MVKL	0xca62c1d6,$K
+||	MVK	16,B0
+	MVKH	0xca62c1d6,$K		; K_60_79
 ___
-	&BODY_20_39(-1);		# BODY_60_78
+	&BODY_20_39("body_60_78?");	# BODY_60_78
 $code.=<<___;
 ;;==================================================
    [A0]	B	loop?
@@ -315,7 +322,7 @@ $code.=<<___;
 	.endasmfunc
 
 	.sect	.const
-	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
 	.align	4
 ___
 
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha256-c64xplus.pl
copy to crypto/sha/asm/sha256-c64x.pl
index 8b92c84..fbe99c0 100644
--- a/crypto/sha/asm/sha256-c64xplus.pl
+++ b/crypto/sha/asm/sha256-c64x.pl
@@ -7,9 +7,9 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA256 for C64x+.
+# SHA256 for C64x.
 #
-# January 2012
+# November 2016
 #
 # Performance is just below 10 cycles per processed byte, which is
 # almost 40% faster than compiler-generated code. Unroll is unlikely
@@ -39,6 +39,14 @@ open STDOUT,">$output";
 $code.=<<___;
 	.text
 
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha256_block_data_order,_sha256_block_data_order
+	.endif
+
 	.asg	B3,RA
 	.asg	A15,FP
 	.asg	B15,SP
@@ -50,6 +58,7 @@ $code.=<<___;
 
 	.global	_sha256_block_data_order
 _sha256_block_data_order:
+__sha256_block:
 	.asmfunc stack_usage(64)
 	MV	$NUM,A0				; reassign $NUM
 ||	MVK	-64,B0
@@ -58,10 +67,17 @@ _sha256_block_data_order:
 || [A0]	MV	SP,FP
    [A0]	ADDKPC	_sha256_block_data_order,B2
 || [A0]	AND	B0,SP,SP			; align stack at 64 bytes
+	.if	__TI_EABI__
    [A0]	MVK	0x00404,B1
-|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
+|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
    [A0]	MVKH	0x50000,B1
-|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
+|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
+	.else
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	(K256-__sha256_block),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	(K256-__sha256_block),$K256
+	.endif
    [A0]	MVC	B1,AMR				; setup circular addressing
 || [A0]	MV	SP,$Xia
    [A0]	MV	SP,$Xib
@@ -79,9 +95,8 @@ _sha256_block_data_order:
 
 	LDNW	*$INP++,$Xn			; pre-fetch input
 	LDW	*$K256++,$K			; pre-fetch K256[0]
-	MVK	14,B0				; loop counters
-	MVK	47,B1
-||	ADDAW	$Xia,9,$Xia
+	NOP
+	ADDAW	$Xia,9,$Xia
 outerloop?:
 	SUB	A0,1,A0
 ||	MV	$A,$Actx
@@ -94,10 +109,10 @@ outerloop?:
 ||	MVD	$H,$Hctx
 ||	SWAP4	$Xn,$X0
 
-	SPLOOPD	8				; BODY_00_14
-||	MVC	B0,ILC
+	MVK	14,B0				; loop counter
 ||	SWAP2	$X0,$X0
 
+loop_00_14?:					; BODY_00_14
 	LDNW	*$INP++,$Xn
 ||	ROTL	$A,30,$S0
 ||	OR	$A,$B,$Maj
@@ -113,6 +128,7 @@ outerloop?:
 ||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
 ||	ROTL	$E,7,$t1e
 ||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+|| [B0]	BDEC	loop_00_14?,B0
 	ADD	$X0,$T1,$T1			; T1 += X[i];
 ||	STW	$X0,*$Xib++
 ||	XOR	$t0a,$S0,$S0
@@ -134,7 +150,7 @@ outerloop?:
 	MV	$B,$C				; c = b
 ||	MV	$A,$B				; b = a
 ||	ADD	$T1,$T2,$A			; a = T1 + T2
-	SPKERNEL
+;;===== branch to loop_00_14? is taken here
 
 	ROTL	$A,30,$S0			; BODY_15
 ||	OR	$A,$B,$Maj
@@ -178,11 +194,11 @@ outerloop?:
 ||	MV	$A,$B				; b = a
 ||	ADD	$T1,$T2,$A			; a = T1 + T2
 
-	SPLOOPD	10				; BODY_16_63
-||	MVC	B1,ILC
+	MVK	47,B1				; loop counter
 ||	ROTL	$X1,14,$t1e			; modulo-scheduled
 ||	ROTL	$X14,13,$t1a			; modulo-scheduled
 
+loop_16_63?:					; BODY_16_63
 	XOR	$t0e,$s0,$s0
 ||	XOR	$t0a,$s1,$s1
 ||	MV	$X15,$X14
@@ -207,6 +223,7 @@ outerloop?:
 ||	ROTL	$E,7,$t1e
 ||	ADD	$H,$K,$T1			; T1 = h + K256[i]
 ||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
+|| [B1]	BDEC	loop_16_63?,B1
 	XOR	$t0a,$S0,$S0
 ||	XOR	$t0e,$S1,$S1
 ||	ADD	$X0,$T1,$T1			; T1 += X[i]
@@ -234,7 +251,7 @@ outerloop?:
 ||	ADD	$T1,$T2,$A			; a = T1 + T2
 ||	SHRU	$X1,3,$s0			; modulo-scheduled
 ||	SHRU	$X14,10,$s1			; modulo-scheduled
-	SPKERNEL
+;;===== branch to loop_16_63? is taken here
 
    [A0]	B	outerloop?
 || [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
@@ -265,7 +282,11 @@ outerloop?:
 ||	STW	$H,*${CTXB}[7]
 	.endasmfunc
 
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
 	.sect	".const:sha_asm"
+	.endif
 	.align	128
 K256:
 	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -284,7 +305,7 @@ K256:
 	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
 	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
 	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
 	.align	4
 
 ___
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha512-c64xplus.pl
copy to crypto/sha/asm/sha512-c64x.pl
index 56c8583..e35a72a 100644
--- a/crypto/sha/asm/sha512-c64xplus.pl
+++ b/crypto/sha/asm/sha512-c64x.pl
@@ -7,11 +7,11 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA512 for C64x+.
+# SHA512 for C64x.
 #
-# January 2012
+# November 2016
 #
-# Performance is 19 cycles per processed byte. Compared to block
+# Performance is ~19 cycles per processed byte. Compared to block
 # transform function from sha512.c compiled with cl6x with -mv6400+
 # -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
 # Loop unroll won't make it, this implementation, any faster, because
@@ -47,6 +47,14 @@ open STDOUT,">$output";
 $code.=<<___;
 	.text
 
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha512_block_data_order,_sha512_block_data_order
+	.endif
+
 	.asg	B3,RA
 	.asg	A15,FP
 	.asg	B15,SP
@@ -61,6 +69,7 @@ $code.=<<___;
 
 	.global	_sha512_block_data_order
 _sha512_block_data_order:
+__sha512_block:
 	.asmfunc stack_usage(40+128)
 	MV	$NUM,A0				; reassign $NUM
 ||	MVK	-128,B0
@@ -75,13 +84,21 @@ _sha512_block_data_order:
    [A0]	STDW	A11:A10,*SP[1]
 || [A0]	MVC	B1,AMR				; setup circular addressing
 || [A0]	ADD	B0,SP,SP			; alloca(128)
+	.if	__TI_EABI__
    [A0]	AND	B0,SP,SP			; align stack at 128 bytes
-|| [A0]	ADDKPC	_sha512_block_data_order,B1
-|| [A0]	MVKL	(K512-_sha512_block_data_order),$K512
-   [A0]	MVKH	(K512-_sha512_block_data_order),$K512
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	\$PCR_OFFSET(K512,__sha512_block),$K512
+   [A0]	MVKH	\$PCR_OFFSET(K512,__sha512_block),$K512
 || [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.else
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	(K512-__sha512_block),$K512
+   [A0]	MVKH	(K512-__sha512_block),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.endif
 	ADDAW	SP,3,$Xilo
-	ADDAW	SP,2,$Xihi
+	ADD	SP,4*2,$Xihi			; ADDAW	SP,2,$Xihi
 
 ||	MV	$CTXA,$CTXB
 	LDW	*${CTXA}[0^.LITTLE_ENDIAN],$Ahi	; load ctx
@@ -134,13 +151,13 @@ loop0_15?:
 	SWAP2	$T1hi,$T1hi
 ||	SWAP2	$T1lo,$T1lo
 	.endif
-loop16_79?:
-	STW	$T1hi,*$Xihi++[2]
+	STW	$T1hi,*$Xihi++[2]			; original loop16_79?
 ||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
 ||	ADD	$Hhi,$T1hi,$T1hi
 ||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
 ||	SHRU	$Ehi,14,$S1hi
 ||	SHL	$Ehi,32-14,$S1lo
+loop16_79?:
 	XOR	$Fhi,$Ghi,$CHhi
 ||	XOR	$Flo,$Glo,$CHlo
 ||	ADD	KHI,$T1hi,$T1hi
@@ -213,21 +230,21 @@ loop16_79?:
 ||	XOR	$t0lo,$S0lo,$S0lo
 ||	ADD	$Ehi,$T1hi,$T1hi
 ||	ADDU	$Elo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += e
-|| [B0]	BNOP	loop0_15?
 ||	SHRU	$Ahi,39-32,$t0lo
 ||	SHL	$Ahi,64-39,$t0hi
+   [B0]	BNOP	loop0_15?
+|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
 	XOR	$t0hi,$S0hi,$S0hi
 ||	XOR	$t0lo,$S0lo,$S0lo
-|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
-||[!B1]	BNOP	break?
 ||	SHRU	$Alo,39-32,$t0hi
 ||	SHL	$Alo,64-39,$t0lo
+||[!B0]	LDW	*${Xihi}[28],$T1hi
+||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
 	XOR	$t0hi,$S0hi,$S0hi
 ||	XOR	$t0lo,$S0lo,$S0lo			; Sigma0(a)
 ||	ADD	$T1carry,$T1hi,$Ehi
-||	MV	$T1lo,$Elo				; e = T1
-||[!B0]	LDW	*${Xihi}[28],$T1hi
-||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
+||	ROTL	$T1lo,0,$Elo				; e = T1, "ghost" value
+||[!B1]	BNOP	break?
 	ADD	$S0hi,$T2hi,$T2hi
 ||	ADDU	$S0lo,$T2carry:$T2lo,$T2carry:$T2lo	; T2 += Sigma0(a)
 || [B1]	LDDW	*$K512++,$Khi:$Klo			; pre-fetch K512[i]
@@ -236,14 +253,13 @@ loop16_79?:
 ||	MV	$T2lo,$Alo				; a = T2
 || [B0]	SUB	B0,1,B0
 ;;===== branch to loop0_15? is taken here
-	NOP
+   [B1]	LDW	*${Xihi}[2],$T2hi
+|| [B1]	LDW	*${Xilo}[2],$T2lo			; X[i+1]
+|| [B1]	SHRU	$T1hi,19,$S1hi
+|| [B1]	SHL	$T1hi,32-19,$S1lo
+   [B1]	SHRU	$T1lo,19,$t0lo
+|| [B1]	SHL	$T1lo,32-19,$t0hi
 ;;===== branch to break? is taken here
-	LDW	*${Xihi}[2],$T2hi
-||	LDW	*${Xilo}[2],$T2lo			; X[i+1]
-||	SHRU	$T1hi,19,$S1hi
-||	SHL	$T1hi,32-19,$S1lo
-	SHRU	$T1lo,19,$t0lo
-||	SHL	$T1lo,32-19,$t0hi
 	XOR	$t0hi,$S1hi,$S1hi
 ||	XOR	$t0lo,$S1lo,$S1lo
 ||	SHRU	$T1hi,61-32,$t0lo
@@ -281,7 +297,6 @@ loop16_79?:
 ||	XOR	$t0lo,$S0lo,$S0lo
 ||	ADD	$S1hi,$T1hi,$T1hi
 ||	ADDU	$S1lo,$T1lo,$T1carry:$T1lo		; T1 = X[i+9]+sigma1()
-|| [B1]	BNOP	loop16_79?
 ||	SHRU	$T2hi,7,$t0hi
 ||	SHL	$T2hi,32-7,$t0lo
 	XOR	$t0hi,$S0hi,$S0hi
@@ -289,6 +304,7 @@ loop16_79?:
 ||	ADD	$CHhi,$T1hi,$T1hi
 ||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += X[i]
 ||	SHRU	$T2lo,7,$t0lo
+|| [B1]	BNOP	loop16_79?
 	XOR	$t0lo,$S0lo,$S0lo			; sigma0(Xi[i+1]
 
 	ADD	$S0hi,$T1hi,$T1hi
@@ -296,6 +312,13 @@ loop16_79?:
 || [B1]	SUB	B1,1,B1
 	NOP						; avoid cross-path stall
 	ADD	$T1carry,$T1hi,$T1hi
+
+   	STW	$T1hi,*$Xihi++[2]			; copied "top" bundle
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
 ;;===== branch to loop16_79? is taken here
 
 break?:
@@ -359,7 +382,11 @@ break?:
 	NOP	2				; wait till FP is committed
 	.endasmfunc
 
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
 	.sect	".const:sha_asm"
+	.endif
 	.align	128
 K512:
 	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
@@ -402,7 +429,7 @@ K512:
 	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
 	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
 	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
-	.cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
 	.align	4
 ___
 
diff --git a/test/fips_algvs.c b/test/fips_algvs.c
index 8ff75dc..2bfd213 100644
--- a/test/fips_algvs.c
+++ b/test/fips_algvs.c
@@ -150,7 +150,7 @@ extern int fips_rsavtest_main(int argc, char **argv);
 extern int fips_shatest_main(int argc, char **argv);
 extern int fips_test_suite_main(int argc, char **argv);
 
-#if !defined(_TMS320C6400_PLUS)
+#if !defined(_TMS320C6400_PLUS) && !defined(_TMS320C6400)
 #include "fips_aesavs.c"
 #include "fips_cmactest.c"
 #include "fips_desmovs.c"
diff --git a/util/mk1mf.pl b/util/mk1mf.pl
index 8934aba..5c4c50a 100755
--- a/util/mk1mf.pl
+++ b/util/mk1mf.pl
@@ -249,7 +249,7 @@ elsif (($platform eq "netware-clib") || ($platform eq "netware-libc") ||
 	$BSDSOCK=1 if ($platform eq "netware-libc-bsdsock") || ($platform eq "netware-clib-bsdsock");
 	require 'netware.pl';
 	}
-elsif ($platform eq "c64xplus")
+elsif ($platform =~ /^c64x/)
 	{
 	require "TI_CGTOOLS.pl";
 	}


More information about the openssl-commits mailing list