[openssl] master update

tomas at openssl.org tomas at openssl.org
Tue Jan 18 10:52:41 UTC 2022


The branch master has been updated
       via  15b7175f558bf9eb057ec3266685486f727dd70f (commit)
      from  c1167f09d840b109ef1c1c1485e3de64be2fc625 (commit)


- Log -----------------------------------------------------------------
commit 15b7175f558bf9eb057ec3266685486f727dd70f
Author: Daniel Hu <Daniel.Hu at arm.com>
Date:   Tue Oct 19 22:49:05 2021 +0100

    SM4 optimization for ARM by HW instruction
    
    This patch implements an SM4 optimization for ARM processors using
    the SM4 HW instructions, which are an optional feature of the
    aarch64 (Armv8) Crypto Extension.
    
    Tested on some modern ARM micro-architectures with SM4 support, the
    observed performance uplift is around 8x~40x over the existing C
    implementation in OpenSSL. Modes that can be parallelized (such as
    CTR, ECB and CBC decryption) are at the higher end of that range,
    while CBC encryption is at the lower end due to its inter-block
    dependency.
    
    Perf data on Yitian-710 2.75 GHz hardware, before and after the optimization:
    
    Before:
      type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
      SM4-CTR  105787.80k   107837.87k   108380.84k   108462.08k   108549.46k   108554.92k
      SM4-ECB  111924.58k   118173.76k   119776.00k   120093.70k   120264.02k   120274.94k
      SM4-CBC  106428.09k   109190.98k   109674.33k   109774.51k   109827.41k   109827.41k
    
    After (7.4x - 36.6x faster):
      type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
      SM4-CTR  781979.02k  2432994.28k  3437753.86k  3834177.88k  3963715.58k  3974556.33k
      SM4-ECB  937590.69k  2941689.02k  3945751.81k  4328655.87k  4459181.40k  4468692.31k
      SM4-CBC  890639.88k  1027746.58k  1050621.78k  1056696.66k  1058613.93k  1058701.31k
    
    Signed-off-by: Daniel Hu <Daniel.Hu at arm.com>
    
    Reviewed-by: Paul Dale <pauli at openssl.org>
    Reviewed-by: Tomas Mraz <tomas at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/17455)

-----------------------------------------------------------------------

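For context: applications pick up this acceleration transparently through
the EVP layer; no API change is involved, and figures like the tables above
are the sort reported by "openssl speed -evp sm4-ctr" (and friends). Below
is a minimal sketch of driving the cipher, with error handling elided; the
demo function name is ours, not OpenSSL's:

    #include <openssl/evp.h>

    /* Encrypt inlen bytes with SM4-CTR via EVP; the HW-backed code path
     * added by this commit is chosen at runtime when OPENSSL_armcap_P
     * advertises the SM4 instructions. */
    int sm4_ctr_encrypt_demo(const unsigned char key[16],
                             const unsigned char iv[16],
                             const unsigned char *in, int inlen,
                             unsigned char *out)
    {
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        int outl = 0, tmpl = 0, ok;

        if (ctx == NULL)
            return 0;
        ok = EVP_EncryptInit_ex(ctx, EVP_sm4_ctr(), NULL, key, iv)
             && EVP_EncryptUpdate(ctx, out, &outl, in, inlen)
             && EVP_EncryptFinal_ex(ctx, out + outl, &tmpl);
        EVP_CIPHER_CTX_free(ctx);
        return ok;
    }
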
Summary of changes:
 crypto/arm64cpuid.pl                               |   8 +
 crypto/arm_arch.h                                  |   1 +
 crypto/armcap.c                                    |  10 +
 crypto/evp/e_sm4.c                                 | 193 +++++--
 crypto/sm4/asm/sm4-armv8.pl                        | 635 +++++++++++++++++++++
 crypto/sm4/build.info                              |  32 +-
 include/crypto/sm4_platform.h                      |  48 ++
 providers/implementations/ciphers/cipher_sm4.h     |   1 +
 .../implementations/ciphers/cipher_sm4_gcm_hw.c    |  20 +-
 providers/implementations/ciphers/cipher_sm4_hw.c  |  57 +-
 10 files changed, 945 insertions(+), 60 deletions(-)
 create mode 100755 crypto/sm4/asm/sm4-armv8.pl
 create mode 100644 include/crypto/sm4_platform.h
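
The e_sm4.c and provider changes below follow OpenSSL's usual dispatch
pattern: the generic mode code from crypto/modes is parameterized with a
per-block (block128_f) or bulk (ecb128_f/cbc128_f/ctr128_f) function
pointer that sm4_init_key()/cipher_hw_sm4_initkey() select once, picking
either the C primitive or an HWSM4_* routine. A toy, self-contained sketch
of the idea (the names and the XOR "cipher" are illustrative only):

    #include <stddef.h>

    /* Toy model of block128_f dispatch: the mode loop stays generic,
     * init chooses the primitive (C fallback vs. HW-backed). */
    typedef void (*block128_f)(const unsigned char in[16],
                               unsigned char out[16], const void *key);

    struct toy_ctx {
        const void *ks;     /* opaque key schedule */
        block128_f block;   /* selected at key-init time */
    };

    /* Stand-in for ossl_sm4_encrypt / HWSM4_encrypt. */
    static void toy_block(const unsigned char in[16],
                          unsigned char out[16], const void *key)
    {
        const unsigned char *k = key;
        int i;

        for (i = 0; i < 16; i++)
            out[i] = (unsigned char)(in[i] ^ k[i]);
    }

    /* Same shape as the fallback loop in sm4_ecb_cipher(): one indirect
     * call per 16-byte block when no bulk ECB routine is wired up. */
    static void toy_ecb(const unsigned char *in, unsigned char *out,
                        size_t len, const struct toy_ctx *ctx)
    {
        size_t i;

        for (i = 0; i + 16 <= len; i += 16)
            ctx->block(in + i, out + i, ctx->ks);
    }

    int main(void)
    {
        unsigned char key[16] = {1}, buf[32] = {0}, out[32];
        struct toy_ctx ctx = { key, toy_block };

        toy_ecb(buf, out, sizeof(buf), &ctx);
        return 0;
    }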

diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index b30f505339..1841c0cc04 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -80,6 +80,14 @@ _armv8_pmull_probe:
 	ret
 .size	_armv8_pmull_probe,.-_armv8_pmull_probe
 
+.globl	_armv8_sm4_probe
+.type	_armv8_sm4_probe,%function
+_armv8_sm4_probe:
+	AARCH64_VALID_CALL_TARGET
+	.long	0xcec08400	// sm4e	v0.4s, v0.4s
+	ret
+.size	_armv8_sm4_probe,.-_armv8_sm4_probe
+
 .globl	_armv8_sha512_probe
 .type	_armv8_sha512_probe,%function
 _armv8_sha512_probe:
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 77173cae42..291620ebc9 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -80,6 +80,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_CPUID     (1<<7)
 # define ARMV8_RNG       (1<<8)
 # define ARMV8_SM3       (1<<9)
+# define ARMV8_SM4       (1<<10)
 
 /*
  * MIDR_EL1 system register
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 93003c9121..5016987eeb 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -54,6 +54,7 @@ void _armv8_sha256_probe(void);
 void _armv8_pmull_probe(void);
 # ifdef __aarch64__
 void _armv8_sm3_probe(void);
+void _armv8_sm4_probe(void);
 void _armv8_sha512_probe(void);
 unsigned int _armv8_cpuid_probe(void);
 void _armv8_rng_probe(void);
@@ -171,6 +172,7 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SHA256        (1 << 6)
 #  define HWCAP_CPUID            (1 << 11)
 #  define HWCAP_CE_SM3           (1 << 18)
+#  define HWCAP_CE_SM4           (1 << 19)
 #  define HWCAP_CE_SHA512        (1 << 21)
                                   /* AT_HWCAP2 */
 #  define HWCAP2                 26
@@ -242,6 +244,9 @@ void OPENSSL_cpuid_setup(void)
             OPENSSL_armcap_P |= ARMV8_SHA256;
 
 #  ifdef __aarch64__
+        if (hwcap & HWCAP_CE_SM4)
+            OPENSSL_armcap_P |= ARMV8_SM4;
+
         if (hwcap & HWCAP_CE_SHA512)
             OPENSSL_armcap_P |= ARMV8_SHA512;
 
@@ -293,6 +298,11 @@ void OPENSSL_cpuid_setup(void)
             OPENSSL_armcap_P |= ARMV8_SHA256;
         }
 #  if defined(__aarch64__) && !defined(__APPLE__)
+        if (sigsetjmp(ill_jmp, 1) == 0) {
+            _armv8_sm4_probe();
+            OPENSSL_armcap_P |= ARMV8_SM4;
+        }
+
         if (sigsetjmp(ill_jmp, 1) == 0) {
             _armv8_sha512_probe();
             OPENSSL_armcap_P |= ARMV8_SHA512;
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index abd603015c..bff79ff197 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -17,92 +17,187 @@
 # include <openssl/modes.h>
 # include "crypto/sm4.h"
 # include "crypto/evp.h"
+# include "crypto/sm4_platform.h"
 # include "evp_local.h"
 
 typedef struct {
-    SM4_KEY ks;
+    union {
+        OSSL_UNION_ALIGN;
+        SM4_KEY ks;
+    } ks;
+    block128_f block;
+    union {
+        ecb128_f ecb;
+        cbc128_f cbc;
+        ctr128_f ctr;
+    } stream;
 } EVP_SM4_KEY;
 
+# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER sm4_##mode = { \
+        nid##_##nmode,blocksize,128/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        EVP_ORIG_GLOBAL,                \
+        sm4_init_key,                   \
+        sm4_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_SM4_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_sm4_##mode(void) \
+{ return &sm4_##mode; }
+
+#define DEFINE_BLOCK_CIPHERS(nid,flags)             \
+        BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)     \
+        BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)      \
+        BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
+
 static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                         const unsigned char *iv, int enc)
 {
-    ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+    int mode;
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    mode = EVP_CIPHER_CTX_get_mode(ctx);
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+        && !enc) {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_decrypt_key(key, &dat->ks.ks);
+            dat->block = (block128_f) HWSM4_decrypt;
+            dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+# endif
+# ifdef HWSM4_ecb_encrypt
+            if (mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+# endif
+        } else
+#endif
+        {
+            dat->block = (block128_f) ossl_sm4_decrypt;
+            ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+        }
+    } else
+#ifdef HWSM4_CAPABLE
+    if (HWSM4_CAPABLE) {
+        HWSM4_set_encrypt_key(key, &dat->ks.ks);
+        dat->block = (block128_f) HWSM4_encrypt;
+        dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+        if (mode == EVP_CIPH_CBC_MODE)
+            dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+        else
+# endif
+# ifdef HWSM4_ecb_encrypt
+        if (mode == EVP_CIPH_ECB_MODE)
+            dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+        else
+# endif
+# ifdef HWSM4_ctr32_encrypt_blocks
+        if (mode == EVP_CIPH_CTR_MODE)
+            dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
+        else
+# endif
+            (void)0;            /* terminate potentially open 'else' */
+    } else
+#endif
+    {
+        dat->block = (block128_f) ossl_sm4_encrypt;
+        ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+    }
     return 1;
 }
 
-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
-                            size_t len, const SM4_KEY *key,
-                            unsigned char *ivec, const int enc)
+static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    if (enc)
-        CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
-                              (block128_f)ossl_sm4_encrypt);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    if (dat->stream.cbc)
+        (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
+                            EVP_CIPHER_CTX_is_encrypting(ctx));
+    else if (EVP_CIPHER_CTX_is_encrypting(ctx))
+        CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
+                              dat->block);
     else
-        CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
-                              (block128_f)ossl_sm4_decrypt);
+        CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
+                              ctx->iv, dat->block);
+    return 1;
 }
 
-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-                               size_t length, const SM4_KEY *key,
-                               unsigned char *ivec, int *num, const int enc)
+static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
-                          (block128_f)ossl_sm4_encrypt);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+    int num = EVP_CIPHER_CTX_get_num(ctx);
+
+    CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &num,
+                          EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
+    EVP_CIPHER_CTX_set_num(ctx, num);
+    return 1;
 }
 
-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
-                            const SM4_KEY *key, const int enc)
+static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    if (enc)
-        ossl_sm4_encrypt(in, out, key);
+    size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
+    size_t i;
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    if (len < bl)
+        return 1;
+
+    if (dat->stream.ecb != NULL)
+        (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
+                            EVP_CIPHER_CTX_is_encrypting(ctx));
     else
-        ossl_sm4_decrypt(in, out, key);
+        for (i = 0, len -= bl; i <= len; i += bl)
+            (*dat->block) (in + i, out + i, &dat->ks);
+
+    return 1;
 }
 
-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-                               size_t length, const SM4_KEY *key,
-                               unsigned char *ivec, int *num)
+static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
-                          (block128_f)ossl_sm4_encrypt);
-}
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+    int num = EVP_CIPHER_CTX_get_num(ctx);
 
-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
-                       16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
-                       sm4_init_key, 0, 0, 0, 0)
+    CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &num, dat->block);
+    EVP_CIPHER_CTX_set_num(ctx, num);
+    return 1;
+}
 
 static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                           const unsigned char *in, size_t len)
 {
     int n = EVP_CIPHER_CTX_get_num(ctx);
     unsigned int num;
-    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
 
     if (n < 0)
         return 0;
     num = (unsigned int)n;
 
-    CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
-                          EVP_CIPHER_CTX_buf_noconst(ctx), &num,
-                          (block128_f)ossl_sm4_encrypt);
+    if (dat->stream.ctr)
+        CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
+                                    ctx->iv,
+                                    EVP_CIPHER_CTX_buf_noconst(ctx),
+                                    &num, dat->stream.ctr);
+    else
+        CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
+                              ctx->iv,
+                              EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+                              dat->block);
     EVP_CIPHER_CTX_set_num(ctx, num);
     return 1;
 }
 
-static const EVP_CIPHER sm4_ctr_mode = {
-    NID_sm4_ctr, 1, 16, 16,
-    EVP_CIPH_CTR_MODE,
-    EVP_ORIG_GLOBAL,
-    sm4_init_key,
-    sm4_ctr_cipher,
-    NULL,
-    sizeof(EVP_SM4_KEY),
-    NULL, NULL, NULL, NULL
-};
-
-const EVP_CIPHER *EVP_sm4_ctr(void)
-{
-    return &sm4_ctr_mode;
-}
-
+DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
 #endif
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
new file mode 100755
index 0000000000..7358a6e6a2
--- /dev/null
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -0,0 +1,635 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements support for SM4 hw support on aarch64
+# Oct 2021
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="sm4_v8";
+my @rks=map("v$_",(0..7));
+
+sub rev32() {
+my $dst = shift;
+my $src = shift;
+$code.=<<___;
+#ifndef __ARMEB__
+	rev32	$dst.16b,$src.16b
+#endif
+___
+}
+
+sub enc_blk () {
+my $data = shift;
+$code.=<<___;
+	sm4e	$data.4s,@rks[0].4s
+	sm4e	$data.4s,@rks[1].4s
+	sm4e	$data.4s,@rks[2].4s
+	sm4e	$data.4s,@rks[3].4s
+	sm4e	$data.4s,@rks[4].4s
+	sm4e	$data.4s,@rks[5].4s
+	sm4e	$data.4s,@rks[6].4s
+	sm4e	$data.4s,@rks[7].4s
+	rev64	$data.4S,$data.4S
+	ext	$data.16b,$data.16b,$data.16b,#8
+___
+}
+
+sub enc_4blks () {
+my $data0 = shift;
+my $data1 = shift;
+my $data2 = shift;
+my $data3 = shift;
+$code.=<<___;
+	sm4e	$data0.4s,@rks[0].4s
+	sm4e	$data1.4s,@rks[0].4s
+	sm4e	$data2.4s,@rks[0].4s
+	sm4e	$data3.4s,@rks[0].4s
+
+	sm4e	$data0.4s,@rks[1].4s
+	sm4e	$data1.4s,@rks[1].4s
+	sm4e	$data2.4s,@rks[1].4s
+	sm4e	$data3.4s,@rks[1].4s
+
+	sm4e	$data0.4s,@rks[2].4s
+	sm4e	$data1.4s,@rks[2].4s
+	sm4e	$data2.4s,@rks[2].4s
+	sm4e	$data3.4s,@rks[2].4s
+
+	sm4e	$data0.4s,@rks[3].4s
+	sm4e	$data1.4s,@rks[3].4s
+	sm4e	$data2.4s,@rks[3].4s
+	sm4e	$data3.4s,@rks[3].4s
+
+	sm4e	$data0.4s,@rks[4].4s
+	sm4e	$data1.4s,@rks[4].4s
+	sm4e	$data2.4s,@rks[4].4s
+	sm4e	$data3.4s,@rks[4].4s
+
+	sm4e	$data0.4s,@rks[5].4s
+	sm4e	$data1.4s,@rks[5].4s
+	sm4e	$data2.4s,@rks[5].4s
+	sm4e	$data3.4s,@rks[5].4s
+
+	sm4e	$data0.4s,@rks[6].4s
+	sm4e	$data1.4s,@rks[6].4s
+	sm4e	$data2.4s,@rks[6].4s
+	sm4e	$data3.4s,@rks[6].4s
+
+	sm4e	$data0.4s,@rks[7].4s
+	rev64	$data0.4S,$data0.4S
+	sm4e	$data1.4s,@rks[7].4s
+	ext	$data0.16b,$data0.16b,$data0.16b,#8
+	rev64	$data1.4S,$data1.4S
+	sm4e	$data2.4s,@rks[7].4s
+	ext	$data1.16b,$data1.16b,$data1.16b,#8
+	rev64	$data2.4S,$data2.4S
+	sm4e	$data3.4s,@rks[7].4s
+	ext	$data2.16b,$data2.16b,$data2.16b,#8
+	rev64	$data3.4S,$data3.4S
+	ext	$data3.16b,$data3.16b,$data3.16b,#8
+___
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch	armv8-a+crypto
+.text
+___
+
+{{{
+$code.=<<___;
+.align	6
+.Lck:
+	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+	.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl	${prefix}_set_encrypt_key
+.type	${prefix}_set_encrypt_key,%function
+.align	5
+${prefix}_set_encrypt_key:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{$key0.4s},[$key]
+	adr	$tmp,.Lfk
+	ld1	{$fkconst.4s},[$tmp]
+	adr	$tmp,.Lck
+	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+	&rev32($key0, $key0);
+$code.=<<___;
+	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+	eor	$key0.16b,$key0.16b,$fkconst.16b;
+	sm4ekey	$key0.4S,$key0.4S,$const0.4S
+	sm4ekey	$key1.4S,$key0.4S,$const1.4S
+	sm4ekey	$key2.4S,$key1.4S,$const2.4S
+	sm4ekey	$key3.4S,$key2.4S,$const3.4S
+	sm4ekey	$key4.4S,$key3.4S,$const4.4S
+	st1	{$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
+	sm4ekey	$key5.4S,$key4.4S,$const5.4S
+	sm4ekey	$key6.4S,$key5.4S,$const6.4S
+	sm4ekey	$key7.4S,$key6.4S,$const7.4S
+	st1	{$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
+	ret
+.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl	${prefix}_set_decrypt_key
+.type	${prefix}_set_decrypt_key,%function
+.align	5
+${prefix}_set_decrypt_key:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{$key0.4s},[$key]
+	adr	$tmp,.Lfk
+	ld1	{$fkconst.4s},[$tmp]
+	adr	$tmp, .Lck
+	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+	&rev32($key0, $key0);
+$code.=<<___;
+	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+	eor	$key0.16b, $key0.16b,$fkconst.16b;
+	sm4ekey	$key0.4S,$key0.4S,$const0.4S
+	sm4ekey	$key1.4S,$key0.4S,$const1.4S
+	sm4ekey	$key2.4S,$key1.4S,$const2.4S
+	rev64	$key0.4s,$key0.4s
+	rev64	$key1.4s,$key1.4s
+	ext	$key0.16b,$key0.16b,$key0.16b,#8
+	ext	$key1.16b,$key1.16b,$key1.16b,#8
+	sm4ekey	$key3.4S,$key2.4S,$const3.4S
+	sm4ekey	$key4.4S,$key3.4S,$const4.4S
+	rev64	$key2.4s,$key2.4s
+	rev64	$key3.4s,$key3.4s
+	ext	$key2.16b,$key2.16b,$key2.16b,#8
+	ext	$key3.16b,$key3.16b,$key3.16b,#8
+	sm4ekey	$key5.4S,$key4.4S,$const5.4S
+	sm4ekey	$key6.4S,$key5.4S,$const6.4S
+	rev64	$key4.4s,$key4.4s
+	rev64	$key5.4s,$key5.4s
+	ext	$key4.16b,$key4.16b,$key4.16b,#8
+	ext	$key5.16b,$key5.16b,$key5.16b,#8
+	sm4ekey	$key7.4S,$key6.4S,$const7.4S
+	rev64	$key6.4s, $key6.4s
+	rev64	$key7.4s, $key7.4s
+	ext	$key6.16b,$key6.16b,$key6.16b,#8
+	ext	$key7.16b,$key7.16b,$key7.16b,#8
+	st1	{$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
+	st1	{$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
+	ret
+.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($inp,$out,$rk)=map("x$_",(0..2));
+my ($data)=("v16");
+$code.=<<___;
+.globl	${prefix}_${dir}crypt
+.type	${prefix}_${dir}crypt,%function
+.align	5
+${prefix}_${dir}crypt:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{$data.4s},[$inp]
+	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+	&rev32($data,$data);
+	&enc_blk($data);
+	&rev32($data,$data);
+$code.=<<___;
+	st1	{$data.4s},[$out]
+	ret
+.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{
+my ($inp,$out,$len,$rk)=map("x$_",(0..3));
+my ($enc) = ("w4");
+my @dat=map("v$_",(16..23));
+$code.=<<___;
+.globl	${prefix}_ecb_encrypt
+.type	${prefix}_ecb_encrypt,%function
+.align	5
+${prefix}_ecb_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+1:
+	cmp	$len,#64
+	b.lt	1f
+	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+	cmp	$len,#128
+	b.lt	2f
+	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+	// 8 blocks
+___
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&rev32(@dat[4],@dat[4]);
+	&rev32(@dat[5],@dat[5]);
+	&rev32(@dat[6],@dat[6]);
+	&rev32(@dat[7],@dat[7]);
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&rev32(@dat[4],@dat[4]);
+	&rev32(@dat[5],@dat[5]);
+$code.=<<___;
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+___
+	&rev32(@dat[6],@dat[6]);
+	&rev32(@dat[7],@dat[7]);
+$code.=<<___;
+	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+	subs	$len,$len,#128
+	b.gt	1b
+	ret
+	// 4 blocks
+2:
+___
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+$code.=<<___;
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	subs	$len,$len,#64
+	b.gt	1b
+1:
+	subs	$len,$len,#16
+	b.lt	1f
+	ld1	{@dat[0].4s},[$inp],#16
+___
+	&rev32(@dat[0],@dat[0]);
+	&enc_blk(@dat[0]);
+	&rev32(@dat[0],@dat[0]);
+$code.=<<___;
+	st1	{@dat[0].4s},[$out],#16
+	b.ne	1b
+1:
+	ret
+.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($enc) = ("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec) = ("v8");
+$code.=<<___;
+.globl	${prefix}_cbc_encrypt
+.type	${prefix}_cbc_encrypt,%function
+.align	5
+${prefix}_cbc_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	stp	d8,d9,[sp, #-16]!
+
+	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+	ld1	{$ivec.4s},[$ivp]
+	cmp	$enc,#0
+	b.eq	.Ldec
+1:
+	cmp	$len, #64
+	b.lt	1f
+	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
+___
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&enc_blk(@dat[0]);
+$code.=<<___;
+	eor	@dat[1].16b,@dat[1].16b,@dat[0].16b
+___
+	&enc_blk(@dat[1]);
+	&rev32(@dat[0],@dat[0]);
+$code.=<<___;
+	eor	@dat[2].16b,@dat[2].16b,@dat[1].16b
+___
+	&enc_blk(@dat[2]);
+	&rev32(@dat[1],@dat[1]);
+$code.=<<___;
+	eor	@dat[3].16b,@dat[3].16b,@dat[2].16b
+___
+	&enc_blk(@dat[3]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+$code.=<<___;
+	mov	$ivec.16b,@dat[3].16b
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	subs	$len,$len,#64
+	b.ne	1b
+1:
+	subs	$len,$len,#16
+	b.lt	3f
+	ld1	{@dat[0].4s},[$inp],#16
+	eor	$ivec.16b,$ivec.16b,@dat[0].16b
+___
+	&rev32($ivec,$ivec);
+	&enc_blk($ivec);
+	&rev32($ivec,$ivec);
+$code.=<<___;
+	st1	{$ivec.16b},[$out],#16
+	b.ne	1b
+	b	3f
+.Ldec:
+1:
+	cmp	$len, #64
+	b.lt	1f
+	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
+	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+	cmp	$len,#128
+	b.lt	2f
+	// 8 blocks mode
+	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
+	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+___
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],$dat[3]);
+	&rev32(@dat[4],@dat[4]);
+	&rev32(@dat[5],@dat[5]);
+	&rev32(@dat[6],@dat[6]);
+	&rev32(@dat[7],$dat[7]);
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&rev32(@dat[4],@dat[4]);
+	&rev32(@dat[5],@dat[5]);
+	&rev32(@dat[6],@dat[6]);
+	&rev32(@dat[7],@dat[7]);
+$code.=<<___;
+	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
+	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
+	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
+	mov	$ivec.16b,@in[7].16b
+	eor	@dat[3].16b,$dat[3].16b,@in[2].16b
+	eor	@dat[4].16b,$dat[4].16b,@in[3].16b
+	eor	@dat[5].16b,$dat[5].16b,@in[4].16b
+	eor	@dat[6].16b,$dat[6].16b,@in[5].16b
+	eor	@dat[7].16b,$dat[7].16b,@in[6].16b
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+	subs	$len,$len,128
+	b.gt	1b
+	b	3f
+	// 4 blocks mode
+2:
+___
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],$dat[3]);
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+$code.=<<___;
+	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
+	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
+	mov	$ivec.16b,@in[3].16b
+	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
+	eor	@dat[3].16b,$dat[3].16b,@in[2].16b
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	subs	$len,$len,#64
+	b.gt	1b
+1:
+	subs	$len,$len,#16
+	b.lt	3f
+	ld1	{@dat[0].4s},[$inp],#16
+	mov	@in[0].16b,@dat[0].16b
+___
+	&rev32(@dat[0],@dat[0]);
+	&enc_blk(@dat[0]);
+	&rev32(@dat[0],@dat[0]);
+$code.=<<___;
+	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
+	mov	$ivec.16b,@in[0].16b
+	st1	{@dat[0].16b},[$out],#16
+	b.ne	1b
+3:
+	// save back IV
+	st1	{$ivec.16b},[$ivp]
+	ldp	d8,d9,[sp],#16
+	ret
+.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($ctr)=("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec)=("v8");
+$code.=<<___;
+.globl	${prefix}_ctr32_encrypt_blocks
+.type	${prefix}_ctr32_encrypt_blocks,%function
+.align	5
+${prefix}_ctr32_encrypt_blocks:
+	AARCH64_VALID_CALL_TARGET
+	stp	d8,d9,[sp, #-16]!
+
+	ld1	{$ivec.4s},[$ivp]
+	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+	&rev32($ivec,$ivec);
+$code.=<<___;
+	mov	$ctr,$ivec.s[3]
+1:
+	cmp	$len,#4
+	b.lt	1f
+	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+	mov	@dat[0].16b,$ivec.16b
+	mov	@dat[1].16b,$ivec.16b
+	mov	@dat[2].16b,$ivec.16b
+	mov	@dat[3].16b,$ivec.16b
+	add	$ctr,$ctr,#1
+	mov	$dat[1].s[3],$ctr
+	add	$ctr,$ctr,#1
+	mov	@dat[2].s[3],$ctr
+	add	$ctr,$ctr,#1
+	mov	@dat[3].s[3],$ctr
+	cmp	$len,#8
+	b.lt	2f
+	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+	mov	@dat[4].16b,$ivec.16b
+	mov	@dat[5].16b,$ivec.16b
+	mov	@dat[6].16b,$ivec.16b
+	mov	@dat[7].16b,$ivec.16b
+	add	$ctr,$ctr,#1
+	mov	$dat[4].s[3],$ctr
+	add	$ctr,$ctr,#1
+	mov	@dat[5].s[3],$ctr
+	add	$ctr,$ctr,#1
+	mov	@dat[6].s[3],$ctr
+	add	$ctr,$ctr,#1
+	mov	@dat[7].s[3],$ctr
+___
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+	&rev32(@dat[4],@dat[4]);
+	&rev32(@dat[5],@dat[5]);
+	&rev32(@dat[6],@dat[6]);
+	&rev32(@dat[7],@dat[7]);
+$code.=<<___;
+	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
+	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
+	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
+	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
+	eor	@dat[4].16b,@dat[4].16b,@in[4].16b
+	eor	@dat[5].16b,@dat[5].16b,@in[5].16b
+	eor	@dat[6].16b,@dat[6].16b,@in[6].16b
+	eor	@dat[7].16b,@dat[7].16b,@in[7].16b
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+	subs	$len,$len,#8
+	b.eq	3f
+	add	$ctr,$ctr,#1
+	mov	$ivec.s[3],$ctr
+	b	1b
+2:
+___
+	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+	&rev32(@dat[0],@dat[0]);
+	&rev32(@dat[1],@dat[1]);
+	&rev32(@dat[2],@dat[2]);
+	&rev32(@dat[3],@dat[3]);
+$code.=<<___;
+	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
+	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
+	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
+	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
+	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+	subs	$len,$len,#4
+	b.eq	3f
+	add	$ctr,$ctr,#1
+	mov	$ivec.s[3],$ctr
+	b	1b
+1:
+	subs	$len,$len,#1
+	b.lt	3f
+	mov	$dat[0].16b,$ivec.16b
+	ld1	{@in[0].4s},[$inp],#16
+___
+	&enc_blk(@dat[0]);
+	&rev32(@dat[0],@dat[0]);
+$code.=<<___;
+	eor	$dat[0].16b,$dat[0].16b,@in[0].16b
+	st1	{$dat[0].4s},[$out],#16
+	b.eq	3f
+	add	$ctr,$ctr,#1
+	mov	$ivec.s[3],$ctr
+	b	1b
+3:
+	ldp	d8,d9,[sp],#16
+	ret
+.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+########################################
+{   my  %opcode = (
+        "sm4e"          => 0xcec08400,
+        "sm4ekey"       => 0xce60c800);
+
+    sub unsm4 {
+        my ($mnemonic,$arg)=@_;
+
+        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+        &&
+        sprintf ".inst\t0x%08x\t//%s %s",
+                        $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                        $mnemonic,$arg;
+    }
+}
+
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
+	print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index b65a7d149e..e27aa49e67 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -1,4 +1,32 @@
 LIBS=../../libcrypto
-SOURCE[../../libcrypto]=\
-        sm4.c
 
+IF[{- !$disabled{asm} -}]
+  $SM4DEF_aarch64=SM4_ASM
+  $SM4ASM_aarch64=sm4-armv8.S
+
+  # Now that we have defined all the arch specific variables, use the
+  # appropriate one, and define the appropriate macros
+  IF[$SM4ASM_{- $target{asm_arch} -}]
+    $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
+    $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
+  ENDIF
+ENDIF
+
+SOURCE[../../libcrypto]= $SM4ASM sm4.c
+
+
+# Implementations are now spread across several libraries, so the defines
+# need to be applied to all affected libraries and modules.
+DEFINE[../../libcrypto]=$SM4DEF
+DEFINE[../../providers/libfips.a]=$SM4DEF
+DEFINE[../../providers/libdefault.a]=$SM4DEF
+# We only need to include the SM4DEF stuff in the legacy provider when it's a
+# separate module and it's dynamically linked with libcrypto.  Otherwise, it
+# already gets everything that the static libcrypto.a has, and doesn't need it
+# added again.
+IF[{- !$disabled{module} && !$disabled{shared} -}]
+  DEFINE[../providers/liblegacy.a]=$SM4DEF
+ENDIF
+
+GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+INCLUDE[sm4-armv8.o]=..
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
new file mode 100644
index 0000000000..42c8b44a43
--- /dev/null
+++ b/include/crypto/sm4_platform.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef OSSL_SM4_PLATFORM_H
+# define OSSL_SM4_PLATFORM_H
+# pragma once
+
+# if defined(OPENSSL_CPUID_OBJ)
+#  if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+#   include "arm_arch.h"
+#   if __ARM_MAX_ARCH__>=8
+#    define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+#    define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+#    define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+#    define HWSM4_encrypt sm4_v8_encrypt
+#    define HWSM4_decrypt sm4_v8_decrypt
+#    define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
+#    define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
+#    define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+#   endif
+#  endif
+# endif /* OPENSSL_CPUID_OBJ */
+
+# if defined(HWSM4_CAPABLE)
+int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
+                   const SM4_KEY *key);
+void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
+                   const SM4_KEY *key);
+void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const SM4_KEY *key,
+                       unsigned char *ivec, const int enc);
+void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const SM4_KEY *key,
+                       const int enc);
+void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                size_t len, const void *key,
+                                const unsigned char ivec[16]);
+# endif /* HWSM4_CAPABLE */
+
+#endif /* OSSL_SM4_PLATFORM_H */
diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h
index f7f833fcb4..01a031a74d 100644
--- a/providers/implementations/ciphers/cipher_sm4.h
+++ b/providers/implementations/ciphers/cipher_sm4.h
@@ -9,6 +9,7 @@
 
 #include "prov/ciphercommon.h"
 #include "crypto/sm4.h"
+#include "crypto/sm4_platform.h"
 
 typedef struct prov_cast_ctx_st {
     PROV_CIPHER_CTX base;      /* Must be first */
diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
index 6bcd1ec406..c0c9b22bd3 100644
--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
@@ -12,6 +12,7 @@
  */
 
 #include "cipher_sm4_gcm.h"
+#include "crypto/sm4_platform.h"
 
 static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
                            size_t keylen)
@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
     SM4_KEY *ks = &actx->ks.ks;
 
     ctx->ks = ks;
-    ossl_sm4_set_key(key, ks);
-    CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
-    ctx->ctr = (ctr128_f)NULL;
+# ifdef HWSM4_CAPABLE
+    if (HWSM4_CAPABLE) {
+        HWSM4_set_encrypt_key(key, ks);
+        CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt);
+#  ifdef HWSM4_ctr32_encrypt_blocks
+        ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
+#  else /* HWSM4_ctr32_encrypt_blocks */
+        ctx->ctr = (ctr128_f)NULL;
+#  endif
+    } else
+# endif /* HWSM4_CAPABLE */
+    {
+        ossl_sm4_set_key(key, ks);
+        CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+        ctx->ctr = (ctr128_f)NULL;
+    }
     ctx->key_set = 1;
 
     return 1;
diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
index 0db04b1a74..4cd3d3d669 100644
--- a/providers/implementations/ciphers/cipher_sm4_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_hw.c
@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
     PROV_SM4_CTX *sctx =  (PROV_SM4_CTX *)ctx;
     SM4_KEY *ks = &sctx->ks.ks;
 
-    ossl_sm4_set_key(key, ks);
     ctx->ks = ks;
     if (ctx->enc
             || (ctx->mode != EVP_CIPH_ECB_MODE
-                && ctx->mode != EVP_CIPH_CBC_MODE))
-        ctx->block = (block128_f)ossl_sm4_encrypt;
-    else
-        ctx->block = (block128_f)ossl_sm4_decrypt;
+                && ctx->mode != EVP_CIPH_CBC_MODE)) {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_encrypt_key(key, ks);
+            ctx->block = (block128_f)HWSM4_encrypt;
+            ctx->stream.cbc = NULL;
+#ifdef HWSM4_cbc_encrypt
+            if (ctx->mode == EVP_CIPH_CBC_MODE)
+                ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
+            else
+#endif
+#ifdef HWSM4_ecb_encrypt
+            if (ctx->mode == EVP_CIPH_ECB_MODE)
+                ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+            else
+#endif
+#ifdef HWSM4_ctr32_encrypt_blocks
+            if (ctx->mode == EVP_CIPH_CTR_MODE)
+                ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks;
+            else
+#endif
+            (void)0;            /* terminate potentially open 'else' */
+        } else
+#endif
+        {
+            ossl_sm4_set_key(key, ks);
+            ctx->block = (block128_f)ossl_sm4_encrypt;
+        }
+    } else {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_decrypt_key(key, ks);
+            ctx->block = (block128_f)HWSM4_decrypt;
+            ctx->stream.cbc = NULL;
+#ifdef HWSM4_cbc_encrypt
+            if (ctx->mode == EVP_CIPH_CBC_MODE)
+                ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
+#endif
+#ifdef HWSM4_ecb_encrypt
+            if (ctx->mode == EVP_CIPH_ECB_MODE)
+                ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+#endif
+        } else
+#endif
+        {
+            ossl_sm4_set_key(key, ks);
+            ctx->block = (block128_f)ossl_sm4_decrypt;
+        }
+    }
+
     return 1;
 }
 
@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX)
 # define PROV_CIPHER_HW_sm4_mode(mode)                                         \
 static const PROV_CIPHER_HW sm4_##mode = {                                     \
     cipher_hw_sm4_initkey,                                                     \
-    ossl_cipher_hw_chunked_##mode,                                             \
+    ossl_cipher_hw_generic_##mode,                                             \
     cipher_hw_sm4_copyctx                                                      \
 };                                                                             \
 const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits)           \

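A closing note on the armcap.c hunk: on aarch64 targets without a usable
HWCAP (Apple platforms excepted via !__APPLE__), the new ARMV8_SM4 bit is
set by executing a single SM4E instruction under a SIGILL guard. A
self-contained sketch of that probe pattern (the handler wiring is
simplified, and all names here are illustrative rather than OpenSSL
internals):

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    #define DEMO_ARMV8_SM4 (1u << 10)   /* mirrors ARMV8_SM4 above */

    static sigjmp_buf ill_jmp;

    static void ill_handler(int sig)
    {
        siglongjmp(ill_jmp, sig);
    }

    static void sm4_probe(void)
    {
    #if defined(__aarch64__)
        /* Same encoding the patch adds to arm64cpuid.pl:
         * 0xcec08400 == sm4e v0.4s, v0.4s */
        __asm__ volatile(".inst 0xcec08400" : : : "v0");
    #else
        raise(SIGILL);              /* simulate an unsupported CPU */
    #endif
    }

    int main(void)
    {
        unsigned int caps = 0;
        struct sigaction sa = { 0 }, old;

        sa.sa_handler = ill_handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGILL, &sa, &old);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            sm4_probe();            /* raises SIGILL if SM4E is absent */
            caps |= DEMO_ARMV8_SM4;
        }
        sigaction(SIGILL, &old, NULL);
        printf("SM4 instructions: %savailable\n",
               (caps & DEMO_ARMV8_SM4) ? "" : "not ");
        return 0;
    }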
