[openssl] master update

Matt Caswell matt at openssl.org
Thu May 20 07:51:55 UTC 2021


The branch master has been updated
       via  e3884ec5c37334e585e9208ce69d7e5b3cad4624 (commit)
      from  b7140b0604bdfaa034452d97648a9c23a97568e4 (commit)


- Log -----------------------------------------------------------------
commit e3884ec5c37334e585e9208ce69d7e5b3cad4624
Author: Pauli <pauli at openssl.org>
Date:   Thu May 20 13:51:59 2021 +1000

    Revert "ARM assembly pack: translate bit-sliced AES implementation to AArch64"
    
    This reverts commit da51566b256e0c0536d5b986e676863b0526bf5e.
    
    Fixes #15321
    
    Reviewed-by: Tim Hudson <tjh at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/15364)

-----------------------------------------------------------------------

Summary of changes:
 crypto/aes/asm/bsaes-armv8.S | 2338 ------------------------------------------
 crypto/aes/build.info        |    5 +-
 2 files changed, 2 insertions(+), 2341 deletions(-)
 delete mode 100644 crypto/aes/asm/bsaes-armv8.S

diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
deleted file mode 100644
index 9bd02d0c8a..0000000000
--- a/crypto/aes/asm/bsaes-armv8.S
+++ /dev/null
@@ -1,2338 +0,0 @@
-// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// ====================================================================
-// Written by Ben Avison <bavison at riscosopen.org> for the OpenSSL
-// project. Rights for redistribution and usage in source and binary
-// forms are granted according to the OpenSSL license.
-// ====================================================================
-//
-// This implementation is a translation of bsaes-armv7 for AArch64.
-// No attempt has been made to carry across the build switches for
-// kernel targets, since the Linux kernel crypto support has moved on
-// from when it was based on OpenSSL.
-
-// A lot of hand-scheduling has been performed. Consequently, this code
-// doesn't factor out neatly into macros in the same way that the
-// AArch32 version did, and there is little to be gained by wrapping it
-// up in Perl, so it is presented as pure assembly.
-
-
-#include "crypto/arm_arch.h"
-
-.text
-
-.type   _bsaes_decrypt8,%function
-.align  4
-// On entry:
-//   x9 -> key (previously expanded using _bsaes_key_convert)
-//   x10 = number of rounds
-//   v0-v7 input data (one 128-bit vector per register)
-// On exit:
-//   x9-x11 corrupted (x9 = key read pointer, x10 = round counter, x11 = permutation table pointer)
-//   other general-purpose registers preserved; condition flags corrupted (subs)
-//   v0-v7 output data
-//   v11-v15 preserved
-//   other SIMD registers corrupted (this includes v8-v10, which AAPCS64 treats as callee-saved)
-_bsaes_decrypt8:
-        ldr     q8, [x9], #16               // v8 = first 16 bytes of key; x9 += 16
-        adr     x11, .LM0ISR                // x11 -> InvShiftRows tables (.LM0ISR, .LISR, .LISRM0 are contiguous)
-        movi    v9.16b, #0x55               // mask for the shift-by-1 swap stage
-        ldr     q10, [x11], #16             // v10 = .LM0ISR; x11 now -> .LISR
-        movi    v16.16b, #0x33              // mask for the shift-by-2 swap stage
-        movi    v17.16b, #0x0f              // mask for the shift-by-4 swap stage
-        sub     x10, x10, #1                // x10 = rounds - 1
-        eor     v0.16b, v0.16b, v8.16b      // XOR v8 into each of v0-v7 (interleaved with the tbl permutes below)
-        eor     v1.16b, v1.16b, v8.16b
-        eor     v2.16b, v2.16b, v8.16b
-        eor     v4.16b, v4.16b, v8.16b
-        eor     v3.16b, v3.16b, v8.16b
-        eor     v5.16b, v5.16b, v8.16b
-        tbl     v0.16b, {v0.16b}, v10.16b   // byte-permute each of v0-v7 through .LM0ISR
-        tbl     v1.16b, {v1.16b}, v10.16b
-        tbl     v2.16b, {v2.16b}, v10.16b
-        tbl     v4.16b, {v4.16b}, v10.16b
-        eor     v6.16b, v6.16b, v8.16b
-        eor     v7.16b, v7.16b, v8.16b
-        tbl     v3.16b, {v3.16b}, v10.16b
-        tbl     v5.16b, {v5.16b}, v10.16b
-        tbl     v6.16b, {v6.16b}, v10.16b
-        ushr    v8.2d, v0.2d, #1            // swap stage 1: t = ((a >> 1) ^ b) & 0x55; b ^= t; a ^= t << 1
-        tbl     v7.16b, {v7.16b}, v10.16b
-        ushr    v10.2d, v4.2d, #1           // (a,b) pairs here: (v0,v1) (v4,v5) (v2,v3) (v6,v7)
-        ushr    v18.2d, v2.2d, #1
-        eor     v8.16b, v8.16b, v1.16b
-        ushr    v19.2d, v6.2d, #1
-        eor     v10.16b, v10.16b, v5.16b
-        eor     v18.16b, v18.16b, v3.16b
-        and     v8.16b, v8.16b, v9.16b
-        eor     v19.16b, v19.16b, v7.16b
-        and     v10.16b, v10.16b, v9.16b
-        and     v18.16b, v18.16b, v9.16b
-        eor     v1.16b, v1.16b, v8.16b
-        shl     v8.2d, v8.2d, #1
-        and     v9.16b, v19.16b, v9.16b     // last use of the 0x55 mask: v9 is overwritten with the masked value
-        eor     v5.16b, v5.16b, v10.16b
-        shl     v10.2d, v10.2d, #1
-        eor     v3.16b, v3.16b, v18.16b
-        shl     v18.2d, v18.2d, #1
-        eor     v0.16b, v0.16b, v8.16b
-        shl     v8.2d, v9.2d, #1
-        eor     v7.16b, v7.16b, v9.16b
-        eor     v4.16b, v4.16b, v10.16b
-        eor     v2.16b, v2.16b, v18.16b
-        ushr    v9.2d, v1.2d, #2            // swap stage 2: same pattern, shift 2, mask 0x33 (v16)
-        eor     v6.16b, v6.16b, v8.16b
-        ushr    v8.2d, v0.2d, #2            // (a,b) pairs here: (v1,v3) (v0,v2) (v5,v7) (v4,v6)
-        ushr    v10.2d, v5.2d, #2
-        ushr    v18.2d, v4.2d, #2
-        eor     v9.16b, v9.16b, v3.16b
-        eor     v8.16b, v8.16b, v2.16b
-        eor     v10.16b, v10.16b, v7.16b
-        eor     v18.16b, v18.16b, v6.16b
-        and     v9.16b, v9.16b, v16.16b
-        and     v8.16b, v8.16b, v16.16b
-        and     v10.16b, v10.16b, v16.16b
-        and     v16.16b, v18.16b, v16.16b   // last use of the 0x33 mask: v16 is overwritten with the masked value
-        eor     v3.16b, v3.16b, v9.16b
-        shl     v9.2d, v9.2d, #2
-        eor     v2.16b, v2.16b, v8.16b
-        shl     v8.2d, v8.2d, #2
-        eor     v7.16b, v7.16b, v10.16b
-        shl     v10.2d, v10.2d, #2
-        eor     v6.16b, v6.16b, v16.16b
-        shl     v16.2d, v16.2d, #2
-        eor     v1.16b, v1.16b, v9.16b
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v5.16b, v5.16b, v10.16b
-        eor     v4.16b, v4.16b, v16.16b
-        ushr    v8.2d, v3.2d, #4            // swap stage 3: same pattern, shift 4, mask 0x0f (v17)
-        ushr    v9.2d, v2.2d, #4            // (a,b) pairs here: (v3,v7) (v2,v6) (v1,v5) (v0,v4)
-        ushr    v10.2d, v1.2d, #4
-        ushr    v16.2d, v0.2d, #4
-        eor     v8.16b, v8.16b, v7.16b
-        eor     v9.16b, v9.16b, v6.16b
-        eor     v10.16b, v10.16b, v5.16b
-        eor     v16.16b, v16.16b, v4.16b
-        and     v8.16b, v8.16b, v17.16b
-        and     v9.16b, v9.16b, v17.16b
-        and     v10.16b, v10.16b, v17.16b
-        and     v16.16b, v16.16b, v17.16b
-        eor     v7.16b, v7.16b, v8.16b
-        shl     v8.2d, v8.2d, #4
-        eor     v6.16b, v6.16b, v9.16b
-        shl     v9.2d, v9.2d, #4
-        eor     v5.16b, v5.16b, v10.16b
-        shl     v10.2d, v10.2d, #4
-        eor     v4.16b, v4.16b, v16.16b
-        shl     v16.2d, v16.2d, #4
-        eor     v3.16b, v3.16b, v8.16b
-        eor     v2.16b, v2.16b, v9.16b
-        eor     v1.16b, v1.16b, v10.16b
-        eor     v0.16b, v0.16b, v16.16b
-        b       .Ldec_sbox                  // first pass enters at the S-box, skipping the key XOR + permute at .Ldec_loop
-.align  4
-.Ldec_loop:
-        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64    // each iteration reads 128 bytes of key: 64 here...
-        ldp     q8, q9, [x9], #32           // ...32 here...
-        eor     v0.16b, v16.16b, v0.16b
-        ldr     q10, [x9], #16              // ...and 16 + 16 in the two ldr; one 16-byte vector is XORed into each of v0-v7
-        eor     v1.16b, v17.16b, v1.16b
-        ldr     q16, [x9], #16              // v16 reused: its first value was already consumed by the eor into v0
-        eor     v2.16b, v18.16b, v2.16b
-        eor     v3.16b, v19.16b, v3.16b
-        eor     v4.16b, v8.16b, v4.16b
-        eor     v5.16b, v9.16b, v5.16b
-        eor     v6.16b, v10.16b, v6.16b
-        eor     v7.16b, v16.16b, v7.16b
-        tbl     v0.16b, {v0.16b}, v28.16b   // permute through v28 (.LISR or .LISRM0, loaded at the end of the previous pass)
-        tbl     v1.16b, {v1.16b}, v28.16b
-        tbl     v2.16b, {v2.16b}, v28.16b
-        tbl     v3.16b, {v3.16b}, v28.16b
-        tbl     v4.16b, {v4.16b}, v28.16b
-        tbl     v5.16b, {v5.16b}, v28.16b
-        tbl     v6.16b, {v6.16b}, v28.16b
-        tbl     v7.16b, {v7.16b}, v28.16b
-.Ldec_sbox:
-        eor     v1.16b, v1.16b, v4.16b      // S-box stage (per the label): eor/and/orr/bsl only, temporaries v8-v10 and v16-v31
-        eor     v3.16b, v3.16b, v4.16b
-        subs    x10, x10, #1                // flags set here are consumed by bcc and bne below; nothing in between alters them
-        eor     v4.16b, v4.16b, v7.16b
-        eor     v2.16b, v2.16b, v7.16b
-        eor     v1.16b, v1.16b, v6.16b
-        eor     v6.16b, v6.16b, v4.16b
-        eor     v2.16b, v2.16b, v5.16b
-        eor     v0.16b, v0.16b, v1.16b
-        eor     v7.16b, v7.16b, v6.16b
-        eor     v8.16b, v6.16b, v2.16b
-        and     v9.16b, v4.16b, v6.16b
-        eor     v10.16b, v2.16b, v6.16b
-        eor     v3.16b, v3.16b, v0.16b
-        eor     v5.16b, v5.16b, v0.16b
-        eor     v16.16b, v7.16b, v4.16b
-        eor     v17.16b, v4.16b, v0.16b
-        and     v18.16b, v0.16b, v2.16b
-        eor     v19.16b, v7.16b, v4.16b
-        eor     v1.16b, v1.16b, v3.16b
-        eor     v20.16b, v3.16b, v0.16b
-        eor     v21.16b, v5.16b, v2.16b
-        eor     v22.16b, v3.16b, v7.16b
-        and     v8.16b, v17.16b, v8.16b
-        orr     v17.16b, v3.16b, v5.16b
-        eor     v23.16b, v1.16b, v6.16b
-        eor     v24.16b, v20.16b, v16.16b
-        eor     v25.16b, v1.16b, v5.16b
-        orr     v26.16b, v20.16b, v21.16b
-        and     v20.16b, v20.16b, v21.16b
-        and     v27.16b, v7.16b, v1.16b
-        eor     v21.16b, v21.16b, v23.16b
-        orr     v28.16b, v16.16b, v23.16b   // v28 becomes a temporary here, so the permutation index must be reloaded after the S-box
-        orr     v29.16b, v22.16b, v25.16b
-        eor     v26.16b, v26.16b, v8.16b
-        and     v16.16b, v16.16b, v23.16b
-        and     v22.16b, v22.16b, v25.16b
-        and     v21.16b, v24.16b, v21.16b
-        eor     v8.16b, v28.16b, v8.16b
-        eor     v23.16b, v5.16b, v2.16b
-        eor     v24.16b, v1.16b, v6.16b
-        eor     v16.16b, v16.16b, v22.16b
-        eor     v22.16b, v3.16b, v0.16b
-        eor     v25.16b, v29.16b, v21.16b
-        eor     v21.16b, v26.16b, v21.16b
-        eor     v8.16b, v8.16b, v20.16b
-        eor     v26.16b, v23.16b, v24.16b
-        eor     v16.16b, v16.16b, v20.16b
-        eor     v28.16b, v22.16b, v19.16b
-        eor     v20.16b, v25.16b, v20.16b
-        eor     v9.16b, v21.16b, v9.16b
-        eor     v8.16b, v8.16b, v18.16b
-        eor     v18.16b, v5.16b, v1.16b
-        eor     v21.16b, v16.16b, v17.16b
-        eor     v16.16b, v16.16b, v17.16b
-        eor     v17.16b, v20.16b, v27.16b
-        eor     v20.16b, v3.16b, v7.16b
-        eor     v25.16b, v9.16b, v8.16b
-        eor     v27.16b, v0.16b, v4.16b
-        and     v29.16b, v9.16b, v17.16b
-        eor     v30.16b, v8.16b, v29.16b
-        eor     v31.16b, v21.16b, v29.16b
-        eor     v29.16b, v21.16b, v29.16b
-        bsl     v30.16b, v17.16b, v21.16b
-        bsl     v31.16b, v9.16b, v8.16b
-        bsl     v16.16b, v30.16b, v29.16b
-        bsl     v21.16b, v29.16b, v30.16b
-        eor     v8.16b, v31.16b, v30.16b
-        and     v1.16b, v1.16b, v31.16b
-        and     v9.16b, v16.16b, v31.16b
-        and     v6.16b, v6.16b, v30.16b
-        eor     v16.16b, v17.16b, v21.16b
-        and     v4.16b, v4.16b, v30.16b
-        eor     v17.16b, v8.16b, v30.16b
-        and     v21.16b, v24.16b, v8.16b
-        eor     v9.16b, v9.16b, v25.16b
-        and     v19.16b, v19.16b, v8.16b
-        eor     v24.16b, v30.16b, v16.16b
-        eor     v25.16b, v30.16b, v16.16b
-        and     v7.16b, v7.16b, v17.16b
-        and     v10.16b, v10.16b, v16.16b
-        eor     v29.16b, v9.16b, v16.16b
-        eor     v30.16b, v31.16b, v9.16b
-        and     v0.16b, v24.16b, v0.16b
-        and     v9.16b, v18.16b, v9.16b
-        and     v2.16b, v25.16b, v2.16b
-        eor     v10.16b, v10.16b, v6.16b
-        eor     v18.16b, v29.16b, v16.16b
-        and     v5.16b, v30.16b, v5.16b
-        eor     v24.16b, v8.16b, v29.16b
-        and     v25.16b, v26.16b, v29.16b
-        and     v26.16b, v28.16b, v29.16b
-        eor     v8.16b, v8.16b, v29.16b
-        eor     v17.16b, v17.16b, v18.16b
-        eor     v5.16b, v1.16b, v5.16b
-        and     v23.16b, v24.16b, v23.16b
-        eor     v21.16b, v21.16b, v25.16b
-        eor     v19.16b, v19.16b, v26.16b
-        eor     v0.16b, v4.16b, v0.16b
-        and     v3.16b, v17.16b, v3.16b
-        eor     v1.16b, v9.16b, v1.16b
-        eor     v9.16b, v25.16b, v23.16b
-        eor     v5.16b, v5.16b, v21.16b
-        eor     v2.16b, v6.16b, v2.16b
-        and     v6.16b, v8.16b, v22.16b
-        eor     v3.16b, v7.16b, v3.16b
-        and     v8.16b, v20.16b, v18.16b
-        eor     v10.16b, v10.16b, v9.16b
-        eor     v0.16b, v0.16b, v19.16b
-        eor     v9.16b, v1.16b, v9.16b
-        eor     v1.16b, v2.16b, v21.16b
-        eor     v3.16b, v3.16b, v19.16b
-        and     v16.16b, v27.16b, v16.16b
-        eor     v17.16b, v26.16b, v6.16b
-        eor     v6.16b, v8.16b, v7.16b
-        eor     v7.16b, v1.16b, v9.16b
-        eor     v1.16b, v5.16b, v3.16b
-        eor     v2.16b, v10.16b, v3.16b
-        eor     v4.16b, v16.16b, v4.16b
-        eor     v8.16b, v6.16b, v17.16b
-        eor     v5.16b, v9.16b, v3.16b
-        eor     v9.16b, v0.16b, v1.16b
-        eor     v6.16b, v7.16b, v1.16b
-        eor     v0.16b, v4.16b, v17.16b
-        eor     v4.16b, v8.16b, v7.16b
-        eor     v7.16b, v9.16b, v2.16b
-        eor     v8.16b, v3.16b, v0.16b
-        eor     v7.16b, v7.16b, v5.16b
-        eor     v3.16b, v4.16b, v7.16b
-        eor     v4.16b, v7.16b, v0.16b
-        eor     v7.16b, v8.16b, v3.16b
-        bcc     .Ldec_done                  // carry clear = the subs borrowed (x10 was already 0): no rounds left
-        ext     v8.16b, v0.16b, v0.16b, #8  // ext of a register with itself = byte rotation; start of the linear mixing step (presumably InvMixColumns - confirm)
-        ext     v9.16b, v1.16b, v1.16b, #8
-        ldr     q28, [x11]                  // load from .LISR in common case (x10 > 0); replaced below when x10 == 0
-        ext     v10.16b, v6.16b, v6.16b, #8
-        ext     v16.16b, v3.16b, v3.16b, #8
-        ext     v17.16b, v5.16b, v5.16b, #8
-        ext     v18.16b, v4.16b, v4.16b, #8
-        eor     v8.16b, v8.16b, v0.16b
-        eor     v9.16b, v9.16b, v1.16b
-        eor     v10.16b, v10.16b, v6.16b
-        eor     v16.16b, v16.16b, v3.16b
-        eor     v17.16b, v17.16b, v5.16b
-        ext     v19.16b, v2.16b, v2.16b, #8
-        ext     v20.16b, v7.16b, v7.16b, #8
-        eor     v18.16b, v18.16b, v4.16b
-        eor     v6.16b, v6.16b, v8.16b
-        eor     v8.16b, v2.16b, v10.16b
-        eor     v4.16b, v4.16b, v9.16b
-        eor     v2.16b, v19.16b, v2.16b
-        eor     v9.16b, v20.16b, v7.16b
-        eor     v0.16b, v0.16b, v16.16b
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v6.16b, v6.16b, v17.16b
-        eor     v8.16b, v8.16b, v16.16b
-        eor     v7.16b, v7.16b, v18.16b
-        eor     v4.16b, v4.16b, v16.16b
-        eor     v2.16b, v3.16b, v2.16b
-        eor     v1.16b, v1.16b, v17.16b
-        eor     v3.16b, v5.16b, v9.16b
-        eor     v5.16b, v8.16b, v17.16b
-        eor     v7.16b, v7.16b, v17.16b
-        ext     v8.16b, v0.16b, v0.16b, #12 // second part of the mixing step: 12-byte rotations
-        ext     v9.16b, v6.16b, v6.16b, #12
-        ext     v10.16b, v4.16b, v4.16b, #12
-        ext     v16.16b, v1.16b, v1.16b, #12
-        ext     v17.16b, v5.16b, v5.16b, #12
-        ext     v18.16b, v7.16b, v7.16b, #12
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v6.16b, v6.16b, v9.16b
-        eor     v4.16b, v4.16b, v10.16b
-        ext     v19.16b, v2.16b, v2.16b, #12
-        ext     v20.16b, v3.16b, v3.16b, #12
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v5.16b, v5.16b, v17.16b
-        eor     v7.16b, v7.16b, v18.16b
-        eor     v2.16b, v2.16b, v19.16b
-        eor     v16.16b, v16.16b, v0.16b
-        eor     v3.16b, v3.16b, v20.16b
-        eor     v17.16b, v17.16b, v4.16b
-        eor     v10.16b, v10.16b, v6.16b
-        ext     v0.16b, v0.16b, v0.16b, #8
-        eor     v9.16b, v9.16b, v1.16b
-        ext     v1.16b, v1.16b, v1.16b, #8
-        eor     v8.16b, v8.16b, v3.16b
-        eor     v16.16b, v16.16b, v3.16b
-        eor     v18.16b, v18.16b, v5.16b
-        eor     v19.16b, v19.16b, v7.16b
-        ext     v21.16b, v5.16b, v5.16b, #8
-        ext     v5.16b, v7.16b, v7.16b, #8
-        eor     v7.16b, v20.16b, v2.16b
-        ext     v4.16b, v4.16b, v4.16b, #8
-        ext     v20.16b, v3.16b, v3.16b, #8
-        eor     v17.16b, v17.16b, v3.16b
-        ext     v2.16b, v2.16b, v2.16b, #8
-        eor     v3.16b, v10.16b, v3.16b
-        ext     v10.16b, v6.16b, v6.16b, #8
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v5.16b, v5.16b, v18.16b
-        eor     v3.16b, v3.16b, v4.16b
-        eor     v7.16b, v20.16b, v7.16b
-        eor     v6.16b, v2.16b, v19.16b
-        eor     v4.16b, v21.16b, v17.16b
-        eor     v2.16b, v10.16b, v9.16b
-        bne     .Ldec_loop                  // still the flags from subs: x10 != 0, so keep v28 = .LISR
-        ldr     q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0); pre-index writeback also advances x11
-        b       .Ldec_loop
-.align  4
-.Ldec_done:
-        ushr    v8.2d, v0.2d, #1            // same three swap stages as at entry (shift 1/2/4), with a different register pairing
-        movi    v9.16b, #0x55
-        ldr     q10, [x9]                   // v10 = next 16 bytes of key (no writeback); XORed into all of v0-v7 at the end
-        ushr    v16.2d, v2.2d, #1
-        movi    v17.16b, #0x33
-        ushr    v18.2d, v6.2d, #1
-        movi    v19.16b, #0x0f
-        eor     v8.16b, v8.16b, v1.16b
-        ushr    v20.2d, v3.2d, #1
-        eor     v16.16b, v16.16b, v7.16b
-        eor     v18.16b, v18.16b, v4.16b
-        and     v8.16b, v8.16b, v9.16b
-        eor     v20.16b, v20.16b, v5.16b
-        and     v16.16b, v16.16b, v9.16b
-        and     v18.16b, v18.16b, v9.16b
-        shl     v21.2d, v8.2d, #1
-        eor     v1.16b, v1.16b, v8.16b
-        and     v8.16b, v20.16b, v9.16b
-        eor     v7.16b, v7.16b, v16.16b
-        shl     v9.2d, v16.2d, #1
-        eor     v4.16b, v4.16b, v18.16b
-        shl     v16.2d, v18.2d, #1
-        eor     v0.16b, v0.16b, v21.16b
-        shl     v18.2d, v8.2d, #1
-        eor     v5.16b, v5.16b, v8.16b
-        eor     v2.16b, v2.16b, v9.16b
-        eor     v6.16b, v6.16b, v16.16b
-        ushr    v8.2d, v1.2d, #2            // shift-by-2 stage, mask 0x33 (v17)
-        eor     v3.16b, v3.16b, v18.16b
-        ushr    v9.2d, v0.2d, #2
-        ushr    v16.2d, v7.2d, #2
-        ushr    v18.2d, v2.2d, #2
-        eor     v8.16b, v8.16b, v4.16b
-        eor     v9.16b, v9.16b, v6.16b
-        eor     v16.16b, v16.16b, v5.16b
-        eor     v18.16b, v18.16b, v3.16b
-        and     v8.16b, v8.16b, v17.16b
-        and     v9.16b, v9.16b, v17.16b
-        and     v16.16b, v16.16b, v17.16b
-        and     v17.16b, v18.16b, v17.16b
-        eor     v4.16b, v4.16b, v8.16b
-        shl     v8.2d, v8.2d, #2
-        eor     v6.16b, v6.16b, v9.16b
-        shl     v9.2d, v9.2d, #2
-        eor     v5.16b, v5.16b, v16.16b
-        shl     v16.2d, v16.2d, #2
-        eor     v3.16b, v3.16b, v17.16b
-        shl     v17.2d, v17.2d, #2
-        eor     v1.16b, v1.16b, v8.16b
-        eor     v0.16b, v0.16b, v9.16b
-        eor     v7.16b, v7.16b, v16.16b
-        eor     v2.16b, v2.16b, v17.16b
-        ushr    v8.2d, v4.2d, #4            // shift-by-4 stage, mask 0x0f (v19)
-        ushr    v9.2d, v6.2d, #4
-        ushr    v16.2d, v1.2d, #4
-        ushr    v17.2d, v0.2d, #4
-        eor     v8.16b, v8.16b, v5.16b
-        eor     v9.16b, v9.16b, v3.16b
-        eor     v16.16b, v16.16b, v7.16b
-        eor     v17.16b, v17.16b, v2.16b
-        and     v8.16b, v8.16b, v19.16b
-        and     v9.16b, v9.16b, v19.16b
-        and     v16.16b, v16.16b, v19.16b
-        and     v17.16b, v17.16b, v19.16b
-        eor     v5.16b, v5.16b, v8.16b
-        shl     v8.2d, v8.2d, #4
-        eor     v3.16b, v3.16b, v9.16b
-        shl     v9.2d, v9.2d, #4
-        eor     v7.16b, v7.16b, v16.16b
-        shl     v16.2d, v16.2d, #4
-        eor     v2.16b, v2.16b, v17.16b
-        shl     v17.2d, v17.2d, #4
-        eor     v4.16b, v4.16b, v8.16b
-        eor     v6.16b, v6.16b, v9.16b
-        eor     v7.16b, v7.16b, v10.16b     // from here on: XOR the key vector in v10 into each of v0-v7
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v2.16b, v2.16b, v10.16b
-        eor     v0.16b, v0.16b, v17.16b
-        eor     v4.16b, v4.16b, v10.16b
-        eor     v6.16b, v6.16b, v10.16b
-        eor     v3.16b, v3.16b, v10.16b
-        eor     v5.16b, v5.16b, v10.16b
-        eor     v1.16b, v1.16b, v10.16b
-        eor     v0.16b, v0.16b, v10.16b
-        ret
-.size   _bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type   _bsaes_const,%object
-.align  6                                   // 2^6 = 64-byte alignment for the start of the table
-_bsaes_const:
-// InvShiftRows constants
-// Used in _bsaes_decrypt8, which assumes contiguity: it takes the address of
-// .LM0ISR and steps through the three entries with post-/pre-indexed 16-byte loads
-// .LM0ISR used with round 0 key
-// .LISR   used with middle round keys
-// .LISRM0 used with final round key
-.LM0ISR:
-.quad   0x0a0e0206070b0f03, 0x0004080c0d010509  // each .quad pair is one 16-byte tbl index vector
-.LISR:
-.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-.quad   0x01040b0e0205080f, 0x0306090c00070a0d
-
-// ShiftRows constants
-// Used in _bsaes_encrypt8, which assumes contiguity (same addressing scheme as above)
-// .LM0SR used with round 0 key
-// .LSR   used with middle round keys
-// .LSRM0 used with final round key
-.LM0SR:
-.quad   0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-.quad   0x0304090e00050a0f, 0x01060b0c0207080d
-
-.LM0_bigendian:                              // NOTE(review): no user of this entry is visible in this part of the file
-.quad   0x02060a0e03070b0f, 0x0004080c0105090d
-.LM0_littleendian:                           // NOTE(review): no user of this entry is visible in this part of the file
-.quad   0x0105090d0004080c, 0x03070b0f02060a0e
-
-// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
-// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
-.LREVM0SR:
-.quad   0x090d01050c000408, 0x03070b0f060a0e02
-
-.align  6                                   // pad to the next 64-byte boundary; the padding is counted by .size below
-.size   _bsaes_const,.-_bsaes_const
-
-.type   _bsaes_encrypt8,%function
-.align  4
-// On entry:
-//   x9 -> key (previously expanded using _bsaes_key_convert)
-//   x10 = number of rounds
-//   v0-v7 input data
-// On exit:
-//   x9-x11 corrupted
-//   other general-purpose registers preserved
-//   v0-v7 output data
-//   v11-v15 preserved
-//   other SIMD registers corrupted
-_bsaes_encrypt8:
-        ldr     q8, [x9], #16
-        adr     x11, .LM0SR
-        ldr     q9, [x11], #16
-_bsaes_encrypt8_alt:
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v1.16b, v1.16b, v8.16b
-        sub     x10, x10, #1
-        eor     v2.16b, v2.16b, v8.16b
-        eor     v4.16b, v4.16b, v8.16b
-        eor     v3.16b, v3.16b, v8.16b
-        eor     v5.16b, v5.16b, v8.16b
-        tbl     v0.16b, {v0.16b}, v9.16b
-        tbl     v1.16b, {v1.16b}, v9.16b
-        tbl     v2.16b, {v2.16b}, v9.16b
-        tbl     v4.16b, {v4.16b}, v9.16b
-        eor     v6.16b, v6.16b, v8.16b
-        eor     v7.16b, v7.16b, v8.16b
-        tbl     v3.16b, {v3.16b}, v9.16b
-        tbl     v5.16b, {v5.16b}, v9.16b
-        tbl     v6.16b, {v6.16b}, v9.16b
-        ushr    v8.2d, v0.2d, #1
-        movi    v10.16b, #0x55
-        tbl     v7.16b, {v7.16b}, v9.16b
-        ushr    v9.2d, v4.2d, #1
-        movi    v16.16b, #0x33
-        ushr    v17.2d, v2.2d, #1
-        eor     v8.16b, v8.16b, v1.16b
-        movi    v18.16b, #0x0f
-        ushr    v19.2d, v6.2d, #1
-        eor     v9.16b, v9.16b, v5.16b
-        eor     v17.16b, v17.16b, v3.16b
-        and     v8.16b, v8.16b, v10.16b
-        eor     v19.16b, v19.16b, v7.16b
-        and     v9.16b, v9.16b, v10.16b
-        and     v17.16b, v17.16b, v10.16b
-        eor     v1.16b, v1.16b, v8.16b
-        shl     v8.2d, v8.2d, #1
-        and     v10.16b, v19.16b, v10.16b
-        eor     v5.16b, v5.16b, v9.16b
-        shl     v9.2d, v9.2d, #1
-        eor     v3.16b, v3.16b, v17.16b
-        shl     v17.2d, v17.2d, #1
-        eor     v0.16b, v0.16b, v8.16b
-        shl     v8.2d, v10.2d, #1
-        eor     v7.16b, v7.16b, v10.16b
-        eor     v4.16b, v4.16b, v9.16b
-        eor     v2.16b, v2.16b, v17.16b
-        ushr    v9.2d, v1.2d, #2
-        eor     v6.16b, v6.16b, v8.16b
-        ushr    v8.2d, v0.2d, #2
-        ushr    v10.2d, v5.2d, #2
-        ushr    v17.2d, v4.2d, #2
-        eor     v9.16b, v9.16b, v3.16b
-        eor     v8.16b, v8.16b, v2.16b
-        eor     v10.16b, v10.16b, v7.16b
-        eor     v17.16b, v17.16b, v6.16b
-        and     v9.16b, v9.16b, v16.16b
-        and     v8.16b, v8.16b, v16.16b
-        and     v10.16b, v10.16b, v16.16b
-        and     v16.16b, v17.16b, v16.16b
-        eor     v3.16b, v3.16b, v9.16b
-        shl     v9.2d, v9.2d, #2
-        eor     v2.16b, v2.16b, v8.16b
-        shl     v8.2d, v8.2d, #2
-        eor     v7.16b, v7.16b, v10.16b
-        shl     v10.2d, v10.2d, #2
-        eor     v6.16b, v6.16b, v16.16b
-        shl     v16.2d, v16.2d, #2
-        eor     v1.16b, v1.16b, v9.16b
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v5.16b, v5.16b, v10.16b
-        eor     v4.16b, v4.16b, v16.16b
-        ushr    v8.2d, v3.2d, #4
-        ushr    v9.2d, v2.2d, #4
-        ushr    v10.2d, v1.2d, #4
-        ushr    v16.2d, v0.2d, #4
-        eor     v8.16b, v8.16b, v7.16b
-        eor     v9.16b, v9.16b, v6.16b
-        eor     v10.16b, v10.16b, v5.16b
-        eor     v16.16b, v16.16b, v4.16b
-        and     v8.16b, v8.16b, v18.16b
-        and     v9.16b, v9.16b, v18.16b
-        and     v10.16b, v10.16b, v18.16b
-        and     v16.16b, v16.16b, v18.16b
-        eor     v7.16b, v7.16b, v8.16b
-        shl     v8.2d, v8.2d, #4
-        eor     v6.16b, v6.16b, v9.16b
-        shl     v9.2d, v9.2d, #4
-        eor     v5.16b, v5.16b, v10.16b
-        shl     v10.2d, v10.2d, #4
-        eor     v4.16b, v4.16b, v16.16b
-        shl     v16.2d, v16.2d, #4
-        eor     v3.16b, v3.16b, v8.16b
-        eor     v2.16b, v2.16b, v9.16b
-        eor     v1.16b, v1.16b, v10.16b
-        eor     v0.16b, v0.16b, v16.16b
-        b       .Lenc_sbox
-.align  4
-.Lenc_loop:
-        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
-        ldp     q8, q9, [x9], #32
-        eor     v0.16b, v16.16b, v0.16b
-        ldr     q10, [x9], #16
-        eor     v1.16b, v17.16b, v1.16b
-        ldr     q16, [x9], #16
-        eor     v2.16b, v18.16b, v2.16b
-        eor     v3.16b, v19.16b, v3.16b
-        eor     v4.16b, v8.16b, v4.16b
-        eor     v5.16b, v9.16b, v5.16b
-        eor     v6.16b, v10.16b, v6.16b
-        eor     v7.16b, v16.16b, v7.16b
-        tbl     v0.16b, {v0.16b}, v28.16b
-        tbl     v1.16b, {v1.16b}, v28.16b
-        tbl     v2.16b, {v2.16b}, v28.16b
-        tbl     v3.16b, {v3.16b}, v28.16b
-        tbl     v4.16b, {v4.16b}, v28.16b
-        tbl     v5.16b, {v5.16b}, v28.16b
-        tbl     v6.16b, {v6.16b}, v28.16b
-        tbl     v7.16b, {v7.16b}, v28.16b
-.Lenc_sbox:
-        eor     v5.16b, v5.16b, v6.16b
-        eor     v3.16b, v3.16b, v0.16b
-        subs    x10, x10, #1
-        eor     v2.16b, v2.16b, v1.16b
-        eor     v5.16b, v5.16b, v0.16b
-        eor     v8.16b, v3.16b, v7.16b
-        eor     v6.16b, v6.16b, v2.16b
-        eor     v7.16b, v7.16b, v5.16b
-        eor     v8.16b, v8.16b, v4.16b
-        eor     v3.16b, v6.16b, v3.16b
-        eor     v4.16b, v4.16b, v5.16b
-        eor     v6.16b, v1.16b, v5.16b
-        eor     v2.16b, v2.16b, v7.16b
-        eor     v1.16b, v8.16b, v1.16b
-        eor     v8.16b, v7.16b, v4.16b
-        eor     v9.16b, v3.16b, v0.16b
-        eor     v10.16b, v7.16b, v6.16b
-        eor     v16.16b, v5.16b, v3.16b
-        eor     v17.16b, v6.16b, v2.16b
-        eor     v18.16b, v5.16b, v1.16b
-        eor     v19.16b, v2.16b, v4.16b
-        eor     v20.16b, v1.16b, v0.16b
-        orr     v21.16b, v8.16b, v9.16b
-        orr     v22.16b, v10.16b, v16.16b
-        eor     v23.16b, v8.16b, v17.16b
-        eor     v24.16b, v9.16b, v18.16b
-        and     v19.16b, v19.16b, v20.16b
-        orr     v20.16b, v17.16b, v18.16b
-        and     v8.16b, v8.16b, v9.16b
-        and     v9.16b, v17.16b, v18.16b
-        and     v17.16b, v23.16b, v24.16b
-        and     v10.16b, v10.16b, v16.16b
-        eor     v16.16b, v21.16b, v19.16b
-        eor     v18.16b, v20.16b, v19.16b
-        and     v19.16b, v2.16b, v1.16b
-        and     v20.16b, v6.16b, v5.16b
-        eor     v21.16b, v22.16b, v17.16b
-        eor     v9.16b, v9.16b, v10.16b
-        eor     v10.16b, v16.16b, v17.16b
-        eor     v16.16b, v18.16b, v8.16b
-        and     v17.16b, v4.16b, v0.16b
-        orr     v18.16b, v7.16b, v3.16b
-        eor     v21.16b, v21.16b, v8.16b
-        eor     v8.16b, v9.16b, v8.16b
-        eor     v9.16b, v10.16b, v19.16b
-        eor     v10.16b, v3.16b, v0.16b
-        eor     v16.16b, v16.16b, v17.16b
-        eor     v17.16b, v5.16b, v1.16b
-        eor     v19.16b, v21.16b, v20.16b
-        eor     v20.16b, v8.16b, v18.16b
-        eor     v8.16b, v8.16b, v18.16b
-        eor     v18.16b, v7.16b, v4.16b
-        eor     v21.16b, v9.16b, v16.16b
-        eor     v22.16b, v6.16b, v2.16b
-        and     v23.16b, v9.16b, v19.16b
-        eor     v24.16b, v10.16b, v17.16b
-        eor     v25.16b, v0.16b, v1.16b
-        eor     v26.16b, v7.16b, v6.16b
-        eor     v27.16b, v18.16b, v22.16b
-        eor     v28.16b, v3.16b, v5.16b
-        eor     v29.16b, v16.16b, v23.16b
-        eor     v30.16b, v20.16b, v23.16b
-        eor     v23.16b, v20.16b, v23.16b
-        eor     v31.16b, v4.16b, v2.16b
-        bsl     v29.16b, v19.16b, v20.16b
-        bsl     v30.16b, v9.16b, v16.16b
-        bsl     v8.16b, v29.16b, v23.16b
-        bsl     v20.16b, v23.16b, v29.16b
-        eor     v9.16b, v30.16b, v29.16b
-        and     v5.16b, v5.16b, v30.16b
-        and     v8.16b, v8.16b, v30.16b
-        and     v1.16b, v1.16b, v29.16b
-        eor     v16.16b, v19.16b, v20.16b
-        and     v2.16b, v2.16b, v29.16b
-        eor     v19.16b, v9.16b, v29.16b
-        and     v17.16b, v17.16b, v9.16b
-        eor     v8.16b, v8.16b, v21.16b
-        and     v20.16b, v22.16b, v9.16b
-        eor     v21.16b, v29.16b, v16.16b
-        eor     v22.16b, v29.16b, v16.16b
-        and     v23.16b, v25.16b, v16.16b
-        and     v6.16b, v6.16b, v19.16b
-        eor     v25.16b, v8.16b, v16.16b
-        eor     v29.16b, v30.16b, v8.16b
-        and     v4.16b, v21.16b, v4.16b
-        and     v8.16b, v28.16b, v8.16b
-        and     v0.16b, v22.16b, v0.16b
-        eor     v21.16b, v23.16b, v1.16b
-        eor     v22.16b, v9.16b, v25.16b
-        eor     v9.16b, v9.16b, v25.16b
-        eor     v23.16b, v25.16b, v16.16b
-        and     v3.16b, v29.16b, v3.16b
-        and     v24.16b, v24.16b, v25.16b
-        and     v25.16b, v27.16b, v25.16b
-        and     v10.16b, v22.16b, v10.16b
-        and     v9.16b, v9.16b, v18.16b
-        eor     v18.16b, v19.16b, v23.16b
-        and     v19.16b, v26.16b, v23.16b
-        eor     v3.16b, v5.16b, v3.16b
-        eor     v17.16b, v17.16b, v24.16b
-        eor     v10.16b, v24.16b, v10.16b
-        and     v16.16b, v31.16b, v16.16b
-        eor     v20.16b, v20.16b, v25.16b
-        eor     v9.16b, v25.16b, v9.16b
-        eor     v4.16b, v2.16b, v4.16b
-        and     v7.16b, v18.16b, v7.16b
-        eor     v18.16b, v19.16b, v6.16b
-        eor     v5.16b, v8.16b, v5.16b
-        eor     v0.16b, v1.16b, v0.16b
-        eor     v1.16b, v21.16b, v10.16b
-        eor     v8.16b, v3.16b, v17.16b
-        eor     v2.16b, v16.16b, v2.16b
-        eor     v3.16b, v6.16b, v7.16b
-        eor     v6.16b, v18.16b, v9.16b
-        eor     v4.16b, v4.16b, v20.16b
-        eor     v10.16b, v5.16b, v10.16b
-        eor     v0.16b, v0.16b, v17.16b
-        eor     v9.16b, v2.16b, v9.16b
-        eor     v3.16b, v3.16b, v20.16b
-        eor     v7.16b, v6.16b, v1.16b
-        eor     v5.16b, v8.16b, v4.16b
-        eor     v6.16b, v10.16b, v1.16b
-        eor     v2.16b, v4.16b, v0.16b
-        eor     v4.16b, v3.16b, v10.16b
-        eor     v9.16b, v9.16b, v7.16b
-        eor     v3.16b, v0.16b, v5.16b
-        eor     v0.16b, v1.16b, v4.16b
-        eor     v1.16b, v4.16b, v8.16b
-        eor     v4.16b, v9.16b, v5.16b
-        eor     v6.16b, v6.16b, v3.16b
-        bcc     .Lenc_done
-        ext     v8.16b, v0.16b, v0.16b, #12
-        ext     v9.16b, v4.16b, v4.16b, #12
-        ldr     q28, [x11]
-        ext     v10.16b, v6.16b, v6.16b, #12
-        ext     v16.16b, v1.16b, v1.16b, #12
-        ext     v17.16b, v3.16b, v3.16b, #12
-        ext     v18.16b, v7.16b, v7.16b, #12
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v4.16b, v4.16b, v9.16b
-        eor     v6.16b, v6.16b, v10.16b
-        ext     v19.16b, v2.16b, v2.16b, #12
-        ext     v20.16b, v5.16b, v5.16b, #12
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v3.16b, v3.16b, v17.16b
-        eor     v7.16b, v7.16b, v18.16b
-        eor     v2.16b, v2.16b, v19.16b
-        eor     v16.16b, v16.16b, v0.16b
-        eor     v5.16b, v5.16b, v20.16b
-        eor     v17.16b, v17.16b, v6.16b
-        eor     v10.16b, v10.16b, v4.16b
-        ext     v0.16b, v0.16b, v0.16b, #8
-        eor     v9.16b, v9.16b, v1.16b
-        ext     v1.16b, v1.16b, v1.16b, #8
-        eor     v8.16b, v8.16b, v5.16b
-        eor     v16.16b, v16.16b, v5.16b
-        eor     v18.16b, v18.16b, v3.16b
-        eor     v19.16b, v19.16b, v7.16b
-        ext     v3.16b, v3.16b, v3.16b, #8
-        ext     v7.16b, v7.16b, v7.16b, #8
-        eor     v20.16b, v20.16b, v2.16b
-        ext     v6.16b, v6.16b, v6.16b, #8
-        ext     v21.16b, v5.16b, v5.16b, #8
-        eor     v17.16b, v17.16b, v5.16b
-        ext     v2.16b, v2.16b, v2.16b, #8
-        eor     v10.16b, v10.16b, v5.16b
-        ext     v22.16b, v4.16b, v4.16b, #8
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v5.16b, v7.16b, v18.16b
-        eor     v4.16b, v3.16b, v17.16b
-        eor     v3.16b, v6.16b, v10.16b
-        eor     v7.16b, v21.16b, v20.16b
-        eor     v6.16b, v2.16b, v19.16b
-        eor     v2.16b, v22.16b, v9.16b
-        bne     .Lenc_loop
-        ldr     q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
-        b       .Lenc_loop
-.align  4
-.Lenc_done:
-        ushr    v8.2d, v0.2d, #1
-        movi    v9.16b, #0x55
-        ldr     q10, [x9]
-        ushr    v16.2d, v3.2d, #1
-        movi    v17.16b, #0x33
-        ushr    v18.2d, v4.2d, #1
-        movi    v19.16b, #0x0f
-        eor     v8.16b, v8.16b, v1.16b
-        ushr    v20.2d, v2.2d, #1
-        eor     v16.16b, v16.16b, v7.16b
-        eor     v18.16b, v18.16b, v6.16b
-        and     v8.16b, v8.16b, v9.16b
-        eor     v20.16b, v20.16b, v5.16b
-        and     v16.16b, v16.16b, v9.16b
-        and     v18.16b, v18.16b, v9.16b
-        shl     v21.2d, v8.2d, #1
-        eor     v1.16b, v1.16b, v8.16b
-        and     v8.16b, v20.16b, v9.16b
-        eor     v7.16b, v7.16b, v16.16b
-        shl     v9.2d, v16.2d, #1
-        eor     v6.16b, v6.16b, v18.16b
-        shl     v16.2d, v18.2d, #1
-        eor     v0.16b, v0.16b, v21.16b
-        shl     v18.2d, v8.2d, #1
-        eor     v5.16b, v5.16b, v8.16b
-        eor     v3.16b, v3.16b, v9.16b
-        eor     v4.16b, v4.16b, v16.16b
-        ushr    v8.2d, v1.2d, #2
-        eor     v2.16b, v2.16b, v18.16b
-        ushr    v9.2d, v0.2d, #2
-        ushr    v16.2d, v7.2d, #2
-        ushr    v18.2d, v3.2d, #2
-        eor     v8.16b, v8.16b, v6.16b
-        eor     v9.16b, v9.16b, v4.16b
-        eor     v16.16b, v16.16b, v5.16b
-        eor     v18.16b, v18.16b, v2.16b
-        and     v8.16b, v8.16b, v17.16b
-        and     v9.16b, v9.16b, v17.16b
-        and     v16.16b, v16.16b, v17.16b
-        and     v17.16b, v18.16b, v17.16b
-        eor     v6.16b, v6.16b, v8.16b
-        shl     v8.2d, v8.2d, #2
-        eor     v4.16b, v4.16b, v9.16b
-        shl     v9.2d, v9.2d, #2
-        eor     v5.16b, v5.16b, v16.16b
-        shl     v16.2d, v16.2d, #2
-        eor     v2.16b, v2.16b, v17.16b
-        shl     v17.2d, v17.2d, #2
-        eor     v1.16b, v1.16b, v8.16b
-        eor     v0.16b, v0.16b, v9.16b
-        eor     v7.16b, v7.16b, v16.16b
-        eor     v3.16b, v3.16b, v17.16b
-        ushr    v8.2d, v6.2d, #4
-        ushr    v9.2d, v4.2d, #4
-        ushr    v16.2d, v1.2d, #4
-        ushr    v17.2d, v0.2d, #4
-        eor     v8.16b, v8.16b, v5.16b
-        eor     v9.16b, v9.16b, v2.16b
-        eor     v16.16b, v16.16b, v7.16b
-        eor     v17.16b, v17.16b, v3.16b
-        and     v8.16b, v8.16b, v19.16b
-        and     v9.16b, v9.16b, v19.16b
-        and     v16.16b, v16.16b, v19.16b
-        and     v17.16b, v17.16b, v19.16b
-        eor     v5.16b, v5.16b, v8.16b
-        shl     v8.2d, v8.2d, #4
-        eor     v2.16b, v2.16b, v9.16b
-        shl     v9.2d, v9.2d, #4
-        eor     v7.16b, v7.16b, v16.16b
-        shl     v16.2d, v16.2d, #4
-        eor     v3.16b, v3.16b, v17.16b
-        shl     v17.2d, v17.2d, #4
-        eor     v6.16b, v6.16b, v8.16b
-        eor     v4.16b, v4.16b, v9.16b
-        eor     v7.16b, v7.16b, v10.16b
-        eor     v1.16b, v1.16b, v16.16b
-        eor     v3.16b, v3.16b, v10.16b
-        eor     v0.16b, v0.16b, v17.16b
-        eor     v6.16b, v6.16b, v10.16b
-        eor     v4.16b, v4.16b, v10.16b
-        eor     v2.16b, v2.16b, v10.16b
-        eor     v5.16b, v5.16b, v10.16b
-        eor     v1.16b, v1.16b, v10.16b
-        eor     v0.16b, v0.16b, v10.16b
-        ret
-.size   _bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type   _bsaes_key_convert,%function
-.align  4
-// Expand a conventional AES key schedule into the form consumed by the
-// bit-sliced cipher routines. The round 0 key is written out as a single
-// quadword; every following inner round key is permuted through .LM0, XORed
-// with 0x63 and spread into eight 16-byte masks, one per bit position
-// (128 bytes per round key). The final round key is not stored: it is handed
-// back in v15 for the caller to fix up and save at x17.
-// On entry:
-//   x9 -> input key (big-endian)
-//   x10 = number of rounds
-//   x17 -> output key (native endianness)
-// On exit:
-//   x9, x10 corrupted
-//   x11 -> .LM0_bigendian
-//   x17 -> last quadword of output key
-//   other general-purpose registers preserved
-//   v2-v6 preserved
-//   v7.16b[] = 0x63
-//   v8-v14 preserved
-//   v15 = last round key (converted to native endianness)
-//   other SIMD registers corrupted
-_bsaes_key_convert:
-        // NOTE(review): __ARMEL__ is the AArch32 name of the little-endian
-        // macro; AArch64 compilers predefine __AARCH64EL__ instead. Confirm
-        // that the build defines __ARMEL__ for this file on little-endian
-        // targets, otherwise the big-endian paths below are selected.
-#ifdef __ARMEL__
-        adr     x11, .LM0_littleendian
-#else
-        adr     x11, .LM0_bigendian
-#endif
-        ldr     q0, [x9], #16               // load round 0 key
-        ldr     q1, [x11]                   // .LM0
-        ldr     q15, [x9], #16              // load round 1 key
-
-        movi    v7.16b, #0x63               // compose .L63
-        movi    v16.16b, #0x01              // bit masks
-        movi    v17.16b, #0x02
-        movi    v18.16b, #0x04
-        movi    v19.16b, #0x08
-        movi    v20.16b, #0x10
-        movi    v21.16b, #0x20
-        movi    v22.16b, #0x40
-        movi    v23.16b, #0x80
-
-#ifdef __ARMEL__
-        rev32   v0.16b, v0.16b
-#endif
-        sub     x10, x10, #1                // loop runs (rounds - 1) times: one pass per inner round key
-        str     q0, [x17], #16              // save round 0 key
-
-.align  4
-.Lkey_loop:
-        tbl     v0.16b, {v15.16b}, v1.16b   // permute the current round key bytes through .LM0
-        ldr     q15, [x9], #16              // load next round key
-
-        eor     v0.16b, v0.16b, v7.16b
-        // Each cmtst yields 0xff in every byte lane whose tested bit is set
-        // and 0x00 elsewhere, giving one full-width mask per bit position.
-        cmtst   v24.16b, v0.16b, v16.16b
-        cmtst   v25.16b, v0.16b, v17.16b
-        cmtst   v26.16b, v0.16b, v18.16b
-        cmtst   v27.16b, v0.16b, v19.16b
-        cmtst   v28.16b, v0.16b, v20.16b
-        cmtst   v29.16b, v0.16b, v21.16b
-        cmtst   v30.16b, v0.16b, v22.16b
-        cmtst   v31.16b, v0.16b, v23.16b
-        sub     x10, x10, #1
-        st1     {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
-        st1     {v28.16b-v31.16b}, [x17], #64
-        cbnz    x10, .Lkey_loop
-
-        // don't save last round key
-        // (v15 holds it here: it was loaded on the final loop pass but never permuted or stored)
-#ifdef __ARMEL__
-        rev32   v15.16b, v15.16b
-        adr     x11, .LM0_bigendian         // exit contract: x11 -> .LM0_bigendian regardless of endianness
-#endif
-        ret
-.size   _bsaes_key_convert,.-_bsaes_key_convert
-
-.globl  bsaes_cbc_encrypt
-.type   bsaes_cbc_encrypt,%function
-.align  4
-// CBC-mode decryption built on the bit-sliced AES core. Inputs shorter than
-// 128 bytes are passed straight on to AES_cbc_encrypt. Otherwise the key
-// schedule is converted onto the stack, whole groups of eight blocks are
-// decrypted with _bsaes_decrypt8, a tail of 2-7 blocks goes through
-// _bsaes_decrypt8 as well (surplus lanes are ignored), and a single leftover
-// block is decrypted with AES_decrypt. The stack copy of the key schedule is
-// wiped before returning.
-// On entry:
-//   x0 -> input ciphertext
-//   x1 -> output plaintext
-//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
-//   x3 -> key
-//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
-//   w5 must be == 0
-// On exit:
-//   Output plaintext filled in
-//   Initialisation vector overwritten with last quadword of ciphertext
-//   No output registers, usual AAPCS64 register preservation
-bsaes_cbc_encrypt:
-        cmp     x2, #128
-        // A conditional branch only reaches +/-1MB and the linker cannot
-        // insert a veneer for it, so it must not target an external symbol.
-        // Branch around an unconditional tail call instead.
-        bhs     .Lcbc_do_bsaes
-        b       AES_cbc_encrypt
-.Lcbc_do_bsaes:
-
-        // it is up to the caller to make sure we are called with enc == 0
-
-        stp     fp, lr, [sp, #-48]!
-        stp     d8, d9, [sp, #16]
-        stp     d10, d15, [sp, #32]
-        lsr     x2, x2, #4                  // len in 16 byte blocks
-
-        ldr     w15, [x3, #240]             // get # of rounds
-        mov     x14, sp                     // x14 = top of key schedule; end marker for the wipe loop
-
-        // allocate the key schedule on the stack
-        add     x17, sp, #96
-        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
-
-        // populate the key schedule
-        mov     x9, x3                      // pass key
-        mov     x10, x15                    // pass # of rounds
-        mov     sp, x17                     // sp is sp
-        bl      _bsaes_key_convert
-        ldr     q6,  [sp]
-        str     q15, [x17]                  // save last round key
-        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
-        str     q6, [sp]
-
-        ldr     q15, [x4]                   // load IV
-        b       .Lcbc_dec_loop
-
-.align  4
-.Lcbc_dec_loop:
-        subs    x2, x2, #0x8
-        bmi     .Lcbc_dec_loop_finish
-
-        ldr     q0, [x0], #16               // load input
-        mov     x9, sp                      // pass the key
-        ldr     q1, [x0], #16
-        mov     x10, x15
-        ldr     q2, [x0], #16
-        ldr     q3, [x0], #16
-        ldr     q4, [x0], #16
-        ldr     q5, [x0], #16
-        ldr     q6, [x0], #16
-        ldr     q7, [x0], #-7*16            // rewind x0 so the ciphertext can be re-read for chaining
-
-        bl      _bsaes_decrypt8
-
-        // Decrypted blocks come back in the order v0, v1, v6, v4, v2, v7, v3, v5;
-        // each is XORed with the preceding ciphertext block (the IV for the first).
-        ldr     q16, [x0], #16              // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        eor     v1.16b, v1.16b, v16.16b
-        str     q0, [x1], #16               // write output
-        ldr     q0, [x0], #16
-        str     q1, [x1], #16
-        ldr     q1, [x0], #16
-        eor     v1.16b, v4.16b, v1.16b
-        ldr     q4, [x0], #16
-        eor     v2.16b, v2.16b, v4.16b
-        eor     v0.16b, v6.16b, v0.16b
-        ldr     q4, [x0], #16
-        str     q0, [x1], #16
-        str     q1, [x1], #16
-        eor     v0.16b, v7.16b, v4.16b
-        ldr     q1, [x0], #16
-        str     q2, [x1], #16
-        ldr     q2, [x0], #16
-        ldr     q15, [x0], #16              // last ciphertext block becomes the next IV
-        str     q0, [x1], #16
-        eor     v0.16b, v5.16b, v2.16b
-        eor     v1.16b, v3.16b, v1.16b
-        str     q1, [x1], #16
-        str     q0, [x1], #16
-
-        b       .Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
-        adds    x2, x2, #8                  // x2 = blocks left over (0-7)
-        beq     .Lcbc_dec_done
-
-        // Load only as many blocks as remain; the flags from each cmp are
-        // consumed by both the following blo and the later beq.
-        ldr     q0, [x0], #16               // load input
-        cmp     x2, #2
-        blo     .Lcbc_dec_one
-        ldr     q1, [x0], #16
-        mov     x9, sp                      // pass the key
-        mov     x10, x15
-        beq     .Lcbc_dec_two
-        ldr     q2, [x0], #16
-        cmp     x2, #4
-        blo     .Lcbc_dec_three
-        ldr     q3, [x0], #16
-        beq     .Lcbc_dec_four
-        ldr     q4, [x0], #16
-        cmp     x2, #6
-        blo     .Lcbc_dec_five
-        ldr     q5, [x0], #16
-        beq     .Lcbc_dec_six
-        ldr     q6, [x0], #-6*16
-
-        bl      _bsaes_decrypt8
-
-        ldr     q5, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q8, [x0], #16
-        ldr     q9, [x0], #16
-        ldr     q10, [x0], #16
-        str     q0, [x1], #16               // write output
-        ldr     q0, [x0], #16
-        eor     v1.16b, v1.16b, v5.16b
-        ldr     q5, [x0], #16
-        eor     v6.16b, v6.16b, v8.16b
-        ldr     q15, [x0]
-        eor     v4.16b, v4.16b, v9.16b
-        eor     v2.16b, v2.16b, v10.16b
-        str     q1, [x1], #16
-        eor     v0.16b, v7.16b, v0.16b
-        str     q6, [x1], #16
-        eor     v1.16b, v3.16b, v5.16b
-        str     q4, [x1], #16
-        str     q2, [x1], #16
-        str     q0, [x1], #16
-        str     q1, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_six:
-        sub     x0, x0, #0x60
-        bl      _bsaes_decrypt8
-        ldr     q3, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q5, [x0], #16
-        ldr     q8, [x0], #16
-        ldr     q9, [x0], #16
-        str     q0, [x1], #16               // write output
-        ldr     q0, [x0], #16
-        eor     v1.16b, v1.16b, v3.16b
-        ldr     q15, [x0]
-        eor     v3.16b, v6.16b, v5.16b
-        eor     v4.16b, v4.16b, v8.16b
-        eor     v2.16b, v2.16b, v9.16b
-        str     q1, [x1], #16
-        eor     v0.16b, v7.16b, v0.16b
-        str     q3, [x1], #16
-        str     q4, [x1], #16
-        str     q2, [x1], #16
-        str     q0, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_five:
-        sub     x0, x0, #0x50
-        bl      _bsaes_decrypt8
-        ldr     q3, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q5, [x0], #16
-        ldr     q7, [x0], #16
-        ldr     q8, [x0], #16
-        str     q0, [x1], #16               // write output
-        ldr     q15, [x0]
-        eor     v0.16b, v1.16b, v3.16b
-        eor     v1.16b, v6.16b, v5.16b
-        eor     v3.16b, v4.16b, v7.16b
-        str     q0, [x1], #16
-        eor     v0.16b, v2.16b, v8.16b
-        str     q1, [x1], #16
-        str     q3, [x1], #16
-        str     q0, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_four:
-        sub     x0, x0, #0x40
-        bl      _bsaes_decrypt8
-        ldr     q2, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q3, [x0], #16
-        ldr     q5, [x0], #16
-        str     q0, [x1], #16               // write output
-        ldr     q15, [x0]
-        eor     v0.16b, v1.16b, v2.16b
-        eor     v1.16b, v6.16b, v3.16b
-        eor     v2.16b, v4.16b, v5.16b
-        str     q0, [x1], #16
-        str     q1, [x1], #16
-        str     q2, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_three:
-        sub     x0, x0, #0x30
-        bl      _bsaes_decrypt8
-        ldr     q2, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q3, [x0], #16
-        ldr     q15, [x0]
-        str     q0, [x1], #16               // write output
-        eor     v0.16b, v1.16b, v2.16b
-        eor     v1.16b, v6.16b, v3.16b
-        str     q0, [x1], #16
-        str     q1, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_two:
-        sub     x0, x0, #0x20
-        bl      _bsaes_decrypt8
-        ldr     q2, [x0], #16               // reload input
-        eor     v0.16b, v0.16b, v15.16b     // ^= IV
-        ldr     q15, [x0]
-        str     q0, [x1], #16               // write output
-        eor     v0.16b, v1.16b, v2.16b
-        str     q0, [x1]
-        b       .Lcbc_dec_done
-.align  4
-.Lcbc_dec_one:
-        sub     x0, x0, #0x10
-        // AES_decrypt follows AAPCS64, which only preserves the low 64 bits
-        // of v8-v15, so the full 128-bit IV and ciphertext block cannot be
-        // kept in SIMD registers across the call. Spill them to the stack
-        // together with the caller-saved x1, x4 and x14.
-        // Frame: [sp] x1, [sp,#8] x4, [sp,#16] x14, [sp,#32] IV, [sp,#48] ciphertext
-        stp     x1, x4, [sp, #-64]!
-        str     x14, [sp, #16]
-        stp     q15, q0, [sp, #32]
-        mov     x2, x3
-        bl      AES_decrypt
-        ldp     q8, q15, [sp, #32]          // v8 = IV, v15 = ciphertext block (the IV to return)
-        ldr     x14, [sp, #16]
-        ldp     x1, x4, [sp], #64
-        ldr     q0, [x1]                    // load result
-        eor     v0.16b, v0.16b, v8.16b      // ^= IV
-        str     q0, [x1]                    // write output
-
-.align  4
-.Lcbc_dec_done:
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-.Lcbc_dec_bzero:// wipe key schedule [if any]
-        stp     q0, q1, [sp], #32
-        cmp     sp, x14
-        bne     .Lcbc_dec_bzero
-        str     q15, [x4]                   // return IV
-        ldp     d8, d9, [sp, #16]
-        ldp     d10, d15, [sp, #32]
-        ldp     fp, lr, [sp], #48
-        ret
-.size   bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
-.globl  bsaes_ctr32_encrypt_blocks
-.type   bsaes_ctr32_encrypt_blocks,%function
-.align  4
-// CTR-mode encryption/decryption with a 32-bit block counter. Requests of
-// eight or more blocks convert the key schedule onto the stack and generate
-// keystream eight blocks at a time with _bsaes_encrypt8_alt; a final partial
-// group still encrypts eight counter values but only uses as many as needed.
-// Requests of fewer than eight blocks encrypt one counter block at a time
-// with AES_encrypt. The counter block at x4 is only read, never written back.
-// NOTE: a block count of 0 is not handled - it takes the short path, whose
-// loop decrements the count before testing it for zero.
-// On entry:
-//   x0 -> input text (whole 16-byte blocks)
-//   x1 -> output text (whole 16-byte blocks)
-//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
-//   x3 -> key
-//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
-// On exit:
-//   Output text filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_ctr32_encrypt_blocks:
-
-        cmp     x2, #8                      // use plain AES for
-        blo     .Lctr_enc_short             // small sizes
-
-        stp     fp, lr, [sp, #-80]!
-        stp     d8, d9, [sp, #16]
-        stp     d10, d11, [sp, #32]
-        stp     d12, d13, [sp, #48]
-        stp     d14, d15, [sp, #64]
-
-        ldr     w15, [x3, #240]             // get # of rounds
-        mov     x14, sp                     // x14 = top of key schedule; end marker for the wipe loop
-
-        // allocate the key schedule on the stack
-        add     x17, sp, #96
-        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
-
-        // populate the key schedule
-        mov     x9, x3                      // pass key
-        mov     x10, x15                    // pass # of rounds
-        mov     sp, x17                     // sp is sp
-        bl      _bsaes_key_convert
-        eor     v7.16b, v7.16b, v15.16b     // fix up last round key
-        str     q7, [x17]                   // save last round key
-
-        ldr     q0, [x4]                    // load counter
-        add     x13, x11, #.LREVM0SR-.LM0_bigendian // x11 -> .LM0_bigendian on return from _bsaes_key_convert
-        ldr     q4, [sp]                    // load round0 key
-
-        movi    v8.4s, #1                   // compose 1<<96
-        movi    v9.16b, #0
-        rev32   v15.16b, v0.16b             // v15 = running counter, each 32-bit word byte-reversed
-        rev32   v0.16b, v0.16b              // v0 = first counter value, same form
-        ext     v11.16b, v9.16b, v8.16b, #4 // v11 = 1 in the top 32-bit lane only
-        rev32   v4.16b, v4.16b
-        add     v12.4s, v11.4s, v11.4s      // compose 2<<96
-        str     q4, [sp]                    // save adjusted round0 key
-        add     v13.4s, v11.4s, v12.4s      // compose 3<<96
-        add     v14.4s, v12.4s, v12.4s      // compose 4<<96
-        b       .Lctr_enc_loop
-
-.align  4
-.Lctr_enc_loop:
-        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
-        // to flip byte order in 32-bit counter
-
-        // v0 already holds counter+0; build counter+1..+7 in v1-v7 and
-        // advance v15 by 8 for the next pass.
-        add     v1.4s, v15.4s, v11.4s       // +1
-        add     x9, sp, #0x10               // pass next round key
-        add     v2.4s, v15.4s, v12.4s       // +2
-        ldr     q9, [x13]                   // .LREVM0SR
-        ldr     q8, [sp]                    // load round0 key
-        add     v3.4s, v15.4s, v13.4s       // +3
-        mov     x10, x15                    // pass rounds
-        sub     x11, x13, #.LREVM0SR-.LSR   // pass constants
-        add     v6.4s, v2.4s, v14.4s        // +6
-        add     v4.4s, v15.4s, v14.4s       // +4
-        add     v7.4s, v3.4s, v14.4s        // +7
-        add     v15.4s, v4.4s, v14.4s       // next counter
-        add     v5.4s, v1.4s, v14.4s        // +5
-
-        bl      _bsaes_encrypt8_alt
-
-        subs    x2, x2, #8
-        blo     .Lctr_enc_loop_done         // fewer than 8 blocks left: partial group
-
-        // Keystream blocks are consumed in the order v0, v1, v4, v6, v3, v7, v2, v5.
-        ldr     q16, [x0], #16
-        ldr     q17, [x0], #16
-        eor     v1.16b, v1.16b, v17.16b
-        ldr     q17, [x0], #16
-        eor     v0.16b, v0.16b, v16.16b
-        eor     v4.16b, v4.16b, v17.16b
-        str     q0, [x1], #16
-        ldr     q16, [x0], #16
-        str     q1, [x1], #16
-        mov     v0.16b, v15.16b             // counter+0 for the next pass
-        str     q4, [x1], #16
-        ldr     q1, [x0], #16
-        eor     v4.16b, v6.16b, v16.16b
-        eor     v1.16b, v3.16b, v1.16b
-        ldr     q3, [x0], #16
-        eor     v3.16b, v7.16b, v3.16b
-        ldr     q6, [x0], #16
-        eor     v2.16b, v2.16b, v6.16b
-        ldr     q6, [x0], #16
-        eor     v5.16b, v5.16b, v6.16b
-        str     q4, [x1], #16
-        str     q1, [x1], #16
-        str     q3, [x1], #16
-        str     q2, [x1], #16
-        str     q5, [x1], #16
-
-        // Flags are still those of the subs above: none of the loads, stores
-        // or vector operations in between alter them.
-        bne     .Lctr_enc_loop
-        b       .Lctr_enc_done
-
-.align  4
-.Lctr_enc_loop_done:
-        add     x2, x2, #8                  // x2 = blocks remaining (1-7)
-        // Each cmp below serves two exits: blo straight away, and beq after
-        // one more block has been processed.
-        ldr     q16, [x0], #16              // load input
-        eor     v0.16b, v0.16b, v16.16b
-        str     q0, [x1], #16               // write output
-        cmp     x2, #2
-        blo     .Lctr_enc_done
-        ldr     q17, [x0], #16
-        eor     v1.16b, v1.16b, v17.16b
-        str     q1, [x1], #16
-        beq     .Lctr_enc_done
-        ldr     q18, [x0], #16
-        eor     v4.16b, v4.16b, v18.16b
-        str     q4, [x1], #16
-        cmp     x2, #4
-        blo     .Lctr_enc_done
-        ldr     q19, [x0], #16
-        eor     v6.16b, v6.16b, v19.16b
-        str     q6, [x1], #16
-        beq     .Lctr_enc_done
-        ldr     q20, [x0], #16
-        eor     v3.16b, v3.16b, v20.16b
-        str     q3, [x1], #16
-        cmp     x2, #6
-        blo     .Lctr_enc_done
-        ldr     q21, [x0], #16
-        eor     v7.16b, v7.16b, v21.16b
-        str     q7, [x1], #16
-        beq     .Lctr_enc_done
-        ldr     q22, [x0]
-        eor     v2.16b, v2.16b, v22.16b
-        str     q2, [x1], #16
-
-.Lctr_enc_done:
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-.Lctr_enc_bzero: // wipe key schedule [if any]
-        stp     q0, q1, [sp], #32
-        cmp     sp, x14
-        bne     .Lctr_enc_bzero
-
-        ldp     d8, d9, [sp, #16]
-        ldp     d10, d11, [sp, #32]
-        ldp     d12, d13, [sp, #48]
-        ldp     d14, d15, [sp, #64]
-        ldp     fp, lr, [sp], #80
-        ret
-
-.Lctr_enc_short:
-        // Fewer than 8 blocks: one AES_encrypt call per block.
-        // Frame: [sp] fp/lr, [sp,#16] x19-x23, [sp,#64] encrypted counter,
-        //        [sp,#80] working copy of the counter block
-        stp     fp, lr, [sp, #-96]!
-        stp     x19, x20, [sp, #16]
-        stp     x21, x22, [sp, #32]
-        str     x23, [sp, #48]
-
-        mov     x19, x0                     // copy arguments
-        mov     x20, x1
-        mov     x21, x2
-        mov     x22, x3
-        ldr     w23, [x4, #12]              // load counter .LSW
-        ldr     q1, [x4]                    // load whole counter value
-        // NOTE(review): __ARMEL__ is the AArch32 name of the little-endian
-        // macro; confirm the build defines it for this AArch64 file.
-#ifdef __ARMEL__
-        rev     w23, w23
-#endif
-        str     q1, [sp, #80]               // copy counter value
-
-.Lctr_enc_short_loop:
-        add     x0, sp, #80                 // input counter value
-        add     x1, sp, #64                 // output on the stack
-        mov     x2, x22                     // key
-
-        bl      AES_encrypt
-
-        ldr     q0, [x19], #16              // load input
-        ldr     q1, [sp, #64]               // load encrypted counter
-        add     x23, x23, #1                // only the low 32 bits are stored back, so this wraps modulo 2^32
-#ifdef __ARMEL__
-        rev     w0, w23
-        str     w0, [sp, #80+12]            // next counter value
-#else
-        str     w23, [sp, #80+12]           // next counter value
-#endif
-        eor     v0.16b, v0.16b, v1.16b
-        str     q0, [x20], #16              // store output
-        subs    x21, x21, #1
-        bne     .Lctr_enc_short_loop
-
-        // wipe the encrypted counter and counter copy from the stack
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-        stp     q0, q1, [sp, #64]
-
-        ldr     x23, [sp, #48]
-        ldp     x21, x22, [sp, #32]
-        ldp     x19, x20, [sp, #16]
-        ldp     fp, lr, [sp], #96
-        ret
-.size   bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-
-.globl  bsaes_xts_encrypt
-.type   bsaes_xts_encrypt,%function
-.align  4
-// On entry:
-//   x0 -> input plaintext
-//   x1 -> output ciphertext
-//   x2 = length of text in bytes (must be at least 16)
-//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
-//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-//   x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-//   Output ciphertext filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_xts_encrypt:
-        // Stack layout:
-        // sp ->
-        //        nrounds*128-96 bytes: key schedule
-        // x19 ->
-        //        16 bytes: frame record
-        //        4*16 bytes: tweak storage across _bsaes_encrypt8
-        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
-        //        8*8 bytes: storage for 8 callee-saved SIMD registers
-        stp     fp, lr, [sp, #-192]!
-        stp     x19, x20, [sp, #80]
-        stp     x21, x22, [sp, #96]
-        str     x23, [sp, #112]
-        stp     d8, d9, [sp, #128]
-        stp     d10, d11, [sp, #144]
-        stp     d12, d13, [sp, #160]
-        stp     d14, d15, [sp, #176]
-
-        mov     x19, sp
-        mov     x20, x0
-        mov     x21, x1
-        mov     x22, x2
-        mov     x23, x3
-
-        // generate initial tweak
-        sub     sp, sp, #16
-        mov     x0, x5                      // iv[]
-        mov     x1, sp
-        mov     x2, x4                      // key2
-        bl      AES_encrypt
-        ldr     q11, [sp], #16
-
-        ldr     w1, [x23, #240]             // get # of rounds
-        // allocate the key schedule on the stack
-        add     x17, sp, #96
-        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
-
-        // populate the key schedule
-        mov     x9, x23                     // pass key
-        mov     x10, x1                     // pass # of rounds
-        mov     sp, x17
-        bl      _bsaes_key_convert
-        eor     v15.16b, v15.16b, v7.16b    // fix up last round key
-        str     q15, [x17]                  // save last round key
-
-        subs    x22, x22, #0x80
-        blo     .Lxts_enc_short
-        b       .Lxts_enc_loop
-
-.align  4
-.Lxts_enc_loop:
-        ldr     q8, .Lxts_magic
-        mov     x10, x1                     // pass rounds
-        add     x2, x19, #16
-        ldr     q0, [x20], #16
-        sshr    v1.2d, v11.2d, #63
-        mov     x9, sp                      // pass key schedule
-        ldr     q6, .Lxts_magic+16
-        add     v2.2d, v11.2d, v11.2d
-        cmtst   v3.2d, v11.2d, v6.2d
-        and     v1.16b, v1.16b, v8.16b
-        ext     v1.16b, v1.16b, v1.16b, #8
-        and     v3.16b, v3.16b, v8.16b
-        ldr     q4, [x20], #16
-        eor     v12.16b, v2.16b, v1.16b
-        eor     v1.16b, v4.16b, v12.16b
-        eor     v0.16b, v0.16b, v11.16b
-        cmtst   v2.2d, v12.2d, v6.2d
-        add     v4.2d, v12.2d, v12.2d
-        add     x0, x19, #16
-        ext     v3.16b, v3.16b, v3.16b, #8
-        and     v2.16b, v2.16b, v8.16b
-        eor     v13.16b, v4.16b, v3.16b
-        ldr     q3, [x20], #16
-        ext     v4.16b, v2.16b, v2.16b, #8
-        eor     v2.16b, v3.16b, v13.16b
-        ldr     q3, [x20], #16
-        add     v5.2d, v13.2d, v13.2d
-        cmtst   v7.2d, v13.2d, v6.2d
-        and     v7.16b, v7.16b, v8.16b
-        ldr     q9, [x20], #16
-        ext     v7.16b, v7.16b, v7.16b, #8
-        ldr     q10, [x20], #16
-        eor     v14.16b, v5.16b, v4.16b
-        ldr     q16, [x20], #16
-        add     v4.2d, v14.2d, v14.2d
-        eor     v3.16b, v3.16b, v14.16b
-        eor     v15.16b, v4.16b, v7.16b
-        add     v5.2d, v15.2d, v15.2d
-        ldr     q7, [x20], #16
-        cmtst   v4.2d, v14.2d, v6.2d
-        and     v17.16b, v4.16b, v8.16b
-        cmtst   v18.2d, v15.2d, v6.2d
-        eor     v4.16b, v9.16b, v15.16b
-        ext     v9.16b, v17.16b, v17.16b, #8
-        eor     v9.16b, v5.16b, v9.16b
-        add     v17.2d, v9.2d, v9.2d
-        and     v18.16b, v18.16b, v8.16b
-        eor     v5.16b, v10.16b, v9.16b
-        str     q9, [x2], #16
-        ext     v10.16b, v18.16b, v18.16b, #8
-        cmtst   v9.2d, v9.2d, v6.2d
-        and     v9.16b, v9.16b, v8.16b
-        eor     v10.16b, v17.16b, v10.16b
-        cmtst   v17.2d, v10.2d, v6.2d
-        eor     v6.16b, v16.16b, v10.16b
-        str     q10, [x2], #16
-        ext     v9.16b, v9.16b, v9.16b, #8
-        add     v10.2d, v10.2d, v10.2d
-        eor     v9.16b, v10.16b, v9.16b
-        str     q9, [x2], #16
-        eor     v7.16b, v7.16b, v9.16b
-        add     v9.2d, v9.2d, v9.2d
-        and     v8.16b, v17.16b, v8.16b
-        ext     v8.16b, v8.16b, v8.16b, #8
-        eor     v8.16b, v9.16b, v8.16b
-        str     q8, [x2]                    // next round tweak
-
-        bl      _bsaes_encrypt8
-
-        ldr     q8, [x0], #16
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        ldr     q9, [x0], #16
-        eor     v4.16b, v4.16b, v13.16b
-        eor     v6.16b, v6.16b, v14.16b
-        ldr     q10, [x0], #16
-        eor     v3.16b, v3.16b, v15.16b
-        subs    x22, x22, #0x80
-        str     q0, [x21], #16
-        ldr     q11, [x0]                   // next round tweak
-        str     q1, [x21], #16
-        eor     v0.16b, v7.16b, v8.16b
-        eor     v1.16b, v2.16b, v9.16b
-        str     q4, [x21], #16
-        eor     v2.16b, v5.16b, v10.16b
-        str     q6, [x21], #16
-        str     q3, [x21], #16
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q2, [x21], #16
-        bpl     .Lxts_enc_loop
-
-.Lxts_enc_short:
-        adds    x22, x22, #0x70
-        bmi     .Lxts_enc_done
-
-        ldr     q8, .Lxts_magic
-        sshr    v1.2d, v11.2d, #63
-        add     v2.2d, v11.2d, v11.2d
-        ldr     q9, .Lxts_magic+16
-        subs    x22, x22, #0x10
-        ldr     q0, [x20], #16
-        and     v1.16b, v1.16b, v8.16b
-        cmtst   v3.2d, v11.2d, v9.2d
-        ext     v1.16b, v1.16b, v1.16b, #8
-        and     v3.16b, v3.16b, v8.16b
-        eor     v12.16b, v2.16b, v1.16b
-        ext     v1.16b, v3.16b, v3.16b, #8
-        add     v2.2d, v12.2d, v12.2d
-        cmtst   v3.2d, v12.2d, v9.2d
-        eor     v13.16b, v2.16b, v1.16b
-        and     v22.16b, v3.16b, v8.16b
-        bmi     .Lxts_enc_1
-
-        ext     v2.16b, v22.16b, v22.16b, #8
-        add     v3.2d, v13.2d, v13.2d
-        ldr     q1, [x20], #16
-        cmtst   v4.2d, v13.2d, v9.2d
-        subs    x22, x22, #0x10
-        eor     v14.16b, v3.16b, v2.16b
-        and     v23.16b, v4.16b, v8.16b
-        bmi     .Lxts_enc_2
-
-        ext     v3.16b, v23.16b, v23.16b, #8
-        add     v4.2d, v14.2d, v14.2d
-        ldr     q2, [x20], #16
-        cmtst   v5.2d, v14.2d, v9.2d
-        eor     v0.16b, v0.16b, v11.16b
-        subs    x22, x22, #0x10
-        eor     v15.16b, v4.16b, v3.16b
-        and     v24.16b, v5.16b, v8.16b
-        bmi     .Lxts_enc_3
-
-        ext     v4.16b, v24.16b, v24.16b, #8
-        add     v5.2d, v15.2d, v15.2d
-        ldr     q3, [x20], #16
-        cmtst   v6.2d, v15.2d, v9.2d
-        eor     v1.16b, v1.16b, v12.16b
-        subs    x22, x22, #0x10
-        eor     v16.16b, v5.16b, v4.16b
-        and     v25.16b, v6.16b, v8.16b
-        bmi     .Lxts_enc_4
-
-        ext     v5.16b, v25.16b, v25.16b, #8
-        add     v6.2d, v16.2d, v16.2d
-        add     x0, x19, #16
-        cmtst   v7.2d, v16.2d, v9.2d
-        ldr     q4, [x20], #16
-        eor     v2.16b, v2.16b, v13.16b
-        str     q16, [x0], #16
-        subs    x22, x22, #0x10
-        eor     v17.16b, v6.16b, v5.16b
-        and     v26.16b, v7.16b, v8.16b
-        bmi     .Lxts_enc_5
-
-        ext     v7.16b, v26.16b, v26.16b, #8
-        add     v18.2d, v17.2d, v17.2d
-        ldr     q5, [x20], #16
-        eor     v3.16b, v3.16b, v14.16b
-        str     q17, [x0], #16
-        subs    x22, x22, #0x10
-        eor     v18.16b, v18.16b, v7.16b
-        bmi     .Lxts_enc_6
-
-        ldr     q6, [x20], #16
-        eor     v4.16b, v4.16b, v15.16b
-        eor     v5.16b, v5.16b, v16.16b
-        str     q18, [x0]                   // next round tweak
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1
-        add     x0, x19, #16
-        sub     x22, x22, #0x10
-        eor     v6.16b, v6.16b, v17.16b
-
-        bl      _bsaes_encrypt8
-
-        ldr     q16, [x0], #16
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        ldr     q17, [x0], #16
-        eor     v4.16b, v4.16b, v13.16b
-        eor     v6.16b, v6.16b, v14.16b
-        eor     v3.16b, v3.16b, v15.16b
-        ldr     q11, [x0]                   // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        eor     v0.16b, v7.16b, v16.16b
-        eor     v1.16b, v2.16b, v17.16b
-        str     q4, [x21], #16
-        str     q6, [x21], #16
-        str     q3, [x21], #16
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_6:
-        eor     v4.16b, v4.16b, v15.16b
-        eor     v5.16b, v5.16b, v16.16b
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_encrypt8
-
-        ldr     q16, [x0], #16
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v4.16b, v4.16b, v13.16b
-        eor     v6.16b, v6.16b, v14.16b
-        ldr     q11, [x0]                   // next round tweak
-        eor     v3.16b, v3.16b, v15.16b
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        eor     v0.16b, v7.16b, v16.16b
-        str     q4, [x21], #16
-        str     q6, [x21], #16
-        str     q3, [x21], #16
-        str     q0, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_5:
-        eor     v3.16b, v3.16b, v14.16b
-        eor     v4.16b, v4.16b, v15.16b
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_encrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        ldr     q11, [x0]                   // next round tweak
-        eor     v4.16b, v4.16b, v13.16b
-        eor     v6.16b, v6.16b, v14.16b
-        eor     v3.16b, v3.16b, v15.16b
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q4, [x21], #16
-        str     q6, [x21], #16
-        str     q3, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_4:
-        eor     v2.16b, v2.16b, v13.16b
-        eor     v3.16b, v3.16b, v14.16b
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_encrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v4.16b, v4.16b, v13.16b
-        eor     v6.16b, v6.16b, v14.16b
-        mov     v11.16b, v15.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q4, [x21], #16
-        str     q6, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_3:
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v2.16b, v2.16b, v13.16b
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_encrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v4.16b, v4.16b, v13.16b
-        mov     v11.16b, v14.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q4, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_2:
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_encrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        mov     v11.16b, v13.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        b       .Lxts_enc_done
-
-.align  4
-.Lxts_enc_1:
-        eor     v0.16b, v0.16b, v11.16b
-        sub     x0, sp, #16
-        sub     x1, sp, #16
-        mov     x2, x23
-        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
-        mov     v14.d[0], v12.d[1]
-        str     q0, [sp, #-16]!
-
-        bl      AES_encrypt
-
-        ldr     q0, [sp], #16
-        trn1    v13.2d, v11.2d, v13.2d
-        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
-        eor     v0.16b, v0.16b, v13.16b
-        str     q0, [x21], #16
-
-.Lxts_enc_done:
-        adds    x22, x22, #0x10
-        beq     .Lxts_enc_ret
-
-        sub     x6, x21, #0x10
-        // Penultimate plaintext block produces final ciphertext part-block
-        // plus remaining part of final plaintext block. Move ciphertext part
-        // to final position and re-use penultimate ciphertext block buffer to
-        // construct final plaintext block
-.Lxts_enc_steal:
-        ldrb    w0, [x20], #1
-        ldrb    w1, [x21, #-0x10]
-        strb    w0, [x21, #-0x10]
-        strb    w1, [x21], #1
-
-        subs    x22, x22, #1
-        bhi     .Lxts_enc_steal
-
-        // Finally encrypt the penultimate ciphertext block using the
-        // last tweak
-        ldr     q0, [x6]
-        eor     v0.16b, v0.16b, v11.16b
-        str     q0, [sp, #-16]!
-        mov     x0, sp
-        mov     x1, sp
-        mov     x2, x23
-        mov     x21, x6
-        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
-
-        bl      AES_encrypt
-
-        trn1    v11.2d, v11.2d, v13.2d
-        ldr     q0, [sp], #16
-        eor     v0.16b, v0.16b, v11.16b
-        str     q0, [x21]
-
-.Lxts_enc_ret:
-
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-.Lxts_enc_bzero: // wipe key schedule
-        stp     q0, q1, [sp], #32
-        cmp     sp, x19
-        bne     .Lxts_enc_bzero
-
-        ldp     x19, x20, [sp, #80]
-        ldp     x21, x22, [sp, #96]
-        ldr     x23, [sp, #112]
-        ldp     d8, d9, [sp, #128]
-        ldp     d10, d11, [sp, #144]
-        ldp     d12, d13, [sp, #160]
-        ldp     d14, d15, [sp, #176]
-        ldp     fp, lr, [sp], #192
-        ret
-.size   bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-// XTS tweak-update constants, loaded PC-relative by bsaes_xts_decrypt. The assembler doesn't seem capable
-// of de-duplicating these when expressed using `ldr qd,=` syntax, so assign a symbolic address
-.align  5                                    // 2^5 = 32-byte alignment for the two 16-byte vectors
-.Lxts_magic:
-.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000  // first q: {1, 0x87} masks ANDed with per-lane carry flags; second q: bit 62 of each lane, used with cmtst
-
-.globl  bsaes_xts_decrypt
-.type   bsaes_xts_decrypt,%function
-.align  4
-// XTS-mode decryption: up to 8 blocks per _bsaes_decrypt8 call, ciphertext stealing for a trailing part-block. On entry:
-//   x0 -> input ciphertext
-//   x1 -> output plaintext
-//   x2 -> length of text in bytes (must be at least 16)
-//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
-//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-//   x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-//   Output plaintext filled in
-//   No output registers, usual AAPCS64 register preservation
-bsaes_xts_decrypt:
-        // Stack layout:
-        // sp ->
-        //        nrounds*128-96 bytes: key schedule
-        // x19 ->
-        //        16 bytes: frame record
-        //        4*16 bytes: tweak storage across _bsaes_decrypt8
-        //        6*8 bytes: storage for 5 callee-saved general-purpose registers (x19-x23; last 8 bytes unused)
-        //        8*8 bytes: storage for 8 callee-saved SIMD registers (d8-d15)
-        stp     fp, lr, [sp, #-192]!        // allocate the 192-byte frame and save fp/lr
-        stp     x19, x20, [sp, #80]
-        stp     x21, x22, [sp, #96]
-        str     x23, [sp, #112]
-        stp     d8, d9, [sp, #128]
-        stp     d10, d11, [sp, #144]
-        stp     d12, d13, [sp, #160]
-        stp     d14, d15, [sp, #176]
-
-        mov     x19, sp                     // x19 = frame base (end of the key schedule)
-        mov     x20, x0                     // x20 = input pointer
-        mov     x21, x1                     // x21 = output pointer
-        mov     x22, x2                     // x22 = byte count
-        mov     x23, x3                     // x23 = key1
-
-        // generate initial tweak
-        sub     sp, sp, #16                 // temporary 16-byte slot for the encrypted IV
-        mov     x0, x5                      // iv[]
-        mov     x1, sp                      // output goes to the temporary slot
-        mov     x2, x4                      // key2
-        bl      AES_encrypt
-        ldr     q11, [sp], #16              // v11 = initial tweak; release the slot
-
-        ldr     w1, [x23, #240]             // get # of rounds
-        // allocate the key schedule on the stack
-        add     x17, sp, #96                // sp == x19 here
-        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
-
-        // populate the key schedule
-        mov     x9, x23                     // pass key
-        mov     x10, x1                     // pass # of rounds
-        mov     sp, x17                     // sp = base of the key schedule
-        bl      _bsaes_key_convert          // NOTE(review): helper not in view; x17, v7, v15 are used as its outputs and x1 is assumed preserved
-        ldr     q6,  [sp]                   // first entry of the schedule
-        str     q15, [x17]                  // save last round key
-        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
-        str     q6, [sp]
-
-        sub     x30, x22, #0x10             // x30 is free as scratch: lr is saved in the frame
-        tst     x22, #0xf                   // if not multiple of 16
-        csel    x22, x30, x22, ne           // subtract another 16 bytes
-        subs    x22, x22, #0x80             // bias the count by 8 blocks
-
-        blo     .Lxts_dec_short             // fewer than 8 whole blocks to process
-        b       .Lxts_dec_loop
-
-.align  4
-.Lxts_dec_loop:
-        ldr     q8, .Lxts_magic             // v8 = {1, 0x87} carry masks
-        mov     x10, x1                     // pass rounds
-        add     x2, x19, #16                // x2 = tweak spill area in the frame
-        ldr     q0, [x20], #16              // block 0
-        sshr    v1.2d, v11.2d, #63          // all-ones in each lane of tweak 0 whose top bit is set
-        mov     x9, sp                      // pass key schedule
-        ldr     q6, .Lxts_magic+16          // v6 = bit 62 of each lane
-        add     v2.2d, v11.2d, v11.2d       // tweak 0 doubled lane by lane
-        cmtst   v3.2d, v11.2d, v6.2d        // lanes of tweak 0 with bit 62 set (carry for the doubling after next)
-        and     v1.16b, v1.16b, v8.16b
-        ext     v1.16b, v1.16b, v1.16b, #8  // swap halves so each carry lands in the other lane
-        and     v3.16b, v3.16b, v8.16b
-        ldr     q4, [x20], #16              // block 1
-        eor     v12.16b, v2.16b, v1.16b     // v12 = tweak 1
-        eor     v1.16b, v4.16b, v12.16b     // block 1 ^ tweak 1
-        eor     v0.16b, v0.16b, v11.16b     // block 0 ^ tweak 0
-        cmtst   v2.2d, v12.2d, v6.2d
-        add     v4.2d, v12.2d, v12.2d
-        add     x0, x19, #16                // x0 = tweak spill area, read back after the call
-        ext     v3.16b, v3.16b, v3.16b, #8
-        and     v2.16b, v2.16b, v8.16b
-        eor     v13.16b, v4.16b, v3.16b     // v13 = tweak 2
-        ldr     q3, [x20], #16              // block 2
-        ext     v4.16b, v2.16b, v2.16b, #8
-        eor     v2.16b, v3.16b, v13.16b     // block 2 ^ tweak 2
-        ldr     q3, [x20], #16              // block 3
-        add     v5.2d, v13.2d, v13.2d
-        cmtst   v7.2d, v13.2d, v6.2d
-        and     v7.16b, v7.16b, v8.16b
-        ldr     q9, [x20], #16              // block 4
-        ext     v7.16b, v7.16b, v7.16b, #8
-        ldr     q10, [x20], #16             // block 5
-        eor     v14.16b, v5.16b, v4.16b     // v14 = tweak 3
-        ldr     q16, [x20], #16             // block 6
-        add     v4.2d, v14.2d, v14.2d
-        eor     v3.16b, v3.16b, v14.16b     // block 3 ^ tweak 3
-        eor     v15.16b, v4.16b, v7.16b     // v15 = tweak 4
-        add     v5.2d, v15.2d, v15.2d
-        ldr     q7, [x20], #16              // block 7
-        cmtst   v4.2d, v14.2d, v6.2d
-        and     v17.16b, v4.16b, v8.16b
-        cmtst   v18.2d, v15.2d, v6.2d
-        eor     v4.16b, v9.16b, v15.16b     // block 4 ^ tweak 4
-        ext     v9.16b, v17.16b, v17.16b, #8
-        eor     v9.16b, v5.16b, v9.16b      // v9 = tweak 5
-        add     v17.2d, v9.2d, v9.2d
-        and     v18.16b, v18.16b, v8.16b
-        eor     v5.16b, v10.16b, v9.16b     // block 5 ^ tweak 5
-        str     q9, [x2], #16               // spill tweak 5
-        ext     v10.16b, v18.16b, v18.16b, #8
-        cmtst   v9.2d, v9.2d, v6.2d
-        and     v9.16b, v9.16b, v8.16b
-        eor     v10.16b, v17.16b, v10.16b   // v10 = tweak 6
-        cmtst   v17.2d, v10.2d, v6.2d       // last use of the bit-62 mask in v6
-        eor     v6.16b, v16.16b, v10.16b    // block 6 ^ tweak 6
-        str     q10, [x2], #16              // spill tweak 6
-        ext     v9.16b, v9.16b, v9.16b, #8
-        add     v10.2d, v10.2d, v10.2d
-        eor     v9.16b, v10.16b, v9.16b     // v9 = tweak 7
-        str     q9, [x2], #16               // spill tweak 7
-        eor     v7.16b, v7.16b, v9.16b      // block 7 ^ tweak 7
-        add     v9.2d, v9.2d, v9.2d
-        and     v8.16b, v17.16b, v8.16b
-        ext     v8.16b, v8.16b, v8.16b, #8
-        eor     v8.16b, v9.16b, v8.16b      // v8 = tweak 8
-        str     q8, [x2]                    // next round tweak
-
-        bl      _bsaes_decrypt8             // results are consumed below from v0, v1, v6, v4, v2, v7, v3, v5 (block order)
-
-        eor     v6.16b, v6.16b, v13.16b     // plaintext 2
-        eor     v0.16b, v0.16b, v11.16b     // plaintext 0
-        ldr     q8, [x0], #16               // reload tweak 5
-        eor     v7.16b, v7.16b, v8.16b      // plaintext 5
-        str     q0, [x21], #16
-        eor     v0.16b, v1.16b, v12.16b     // plaintext 1
-        ldr     q1, [x0], #16               // reload tweak 6
-        eor     v1.16b, v3.16b, v1.16b      // plaintext 6
-        subs    x22, x22, #0x80             // flags are consumed by the bpl at the end of the loop
-        eor     v2.16b, v2.16b, v15.16b     // plaintext 4
-        eor     v3.16b, v4.16b, v14.16b     // plaintext 3
-        ldr     q4, [x0], #16               // reload tweak 7
-        str     q0, [x21], #16
-        ldr     q11, [x0]                   // next round tweak
-        eor     v0.16b, v5.16b, v4.16b      // plaintext 7
-        str     q6, [x21], #16
-        str     q3, [x21], #16
-        str     q2, [x21], #16
-        str     q7, [x21], #16
-        str     q1, [x21], #16
-        str     q0, [x21], #16
-        bpl     .Lxts_dec_loop              // at least 8 more blocks remain
-
-.Lxts_dec_short:
-        adds    x22, x22, #0x70             // x22 = bytes still to process here, minus 16
-        bmi     .Lxts_dec_done              // no whole block left for this path
-
-        ldr     q8, .Lxts_magic             // v8 = {1, 0x87} carry masks
-        sshr    v1.2d, v11.2d, #63
-        add     v2.2d, v11.2d, v11.2d
-        ldr     q9, .Lxts_magic+16          // v9 = bit 62 of each lane
-        subs    x22, x22, #0x10
-        ldr     q0, [x20], #16              // block 0
-        and     v1.16b, v1.16b, v8.16b
-        cmtst   v3.2d, v11.2d, v9.2d
-        ext     v1.16b, v1.16b, v1.16b, #8
-        and     v3.16b, v3.16b, v8.16b
-        eor     v12.16b, v2.16b, v1.16b     // v12 = tweak 1
-        ext     v1.16b, v3.16b, v3.16b, #8
-        add     v2.2d, v12.2d, v12.2d
-        cmtst   v3.2d, v12.2d, v9.2d
-        eor     v13.16b, v2.16b, v1.16b     // v13 = tweak 2
-        and     v22.16b, v3.16b, v8.16b
-        bmi     .Lxts_dec_1                 // exactly one block
-
-        ext     v2.16b, v22.16b, v22.16b, #8
-        add     v3.2d, v13.2d, v13.2d
-        ldr     q1, [x20], #16              // block 1
-        cmtst   v4.2d, v13.2d, v9.2d
-        subs    x22, x22, #0x10
-        eor     v14.16b, v3.16b, v2.16b     // v14 = tweak 3
-        and     v23.16b, v4.16b, v8.16b
-        bmi     .Lxts_dec_2                 // exactly two blocks
-
-        ext     v3.16b, v23.16b, v23.16b, #8
-        add     v4.2d, v14.2d, v14.2d
-        ldr     q2, [x20], #16              // block 2
-        cmtst   v5.2d, v14.2d, v9.2d
-        eor     v0.16b, v0.16b, v11.16b     // block 0 ^ tweak 0
-        subs    x22, x22, #0x10
-        eor     v15.16b, v4.16b, v3.16b     // v15 = tweak 4
-        and     v24.16b, v5.16b, v8.16b
-        bmi     .Lxts_dec_3                 // exactly three blocks
-
-        ext     v4.16b, v24.16b, v24.16b, #8
-        add     v5.2d, v15.2d, v15.2d
-        ldr     q3, [x20], #16              // block 3
-        cmtst   v6.2d, v15.2d, v9.2d
-        eor     v1.16b, v1.16b, v12.16b     // block 1 ^ tweak 1
-        subs    x22, x22, #0x10
-        eor     v16.16b, v5.16b, v4.16b     // v16 = tweak 5
-        and     v25.16b, v6.16b, v8.16b
-        bmi     .Lxts_dec_4                 // exactly four blocks
-
-        ext     v5.16b, v25.16b, v25.16b, #8
-        add     v6.2d, v16.2d, v16.2d
-        add     x0, x19, #16                // x0 = tweak spill area in the frame
-        cmtst   v7.2d, v16.2d, v9.2d
-        ldr     q4, [x20], #16              // block 4
-        eor     v2.16b, v2.16b, v13.16b     // block 2 ^ tweak 2
-        str     q16, [x0], #16              // spill tweak 5
-        subs    x22, x22, #0x10
-        eor     v17.16b, v6.16b, v5.16b     // v17 = tweak 6
-        and     v26.16b, v7.16b, v8.16b
-        bmi     .Lxts_dec_5                 // exactly five blocks
-
-        ext     v7.16b, v26.16b, v26.16b, #8
-        add     v18.2d, v17.2d, v17.2d
-        ldr     q5, [x20], #16              // block 5
-        eor     v3.16b, v3.16b, v14.16b     // block 3 ^ tweak 3
-        str     q17, [x0], #16              // spill tweak 6
-        subs    x22, x22, #0x10
-        eor     v18.16b, v18.16b, v7.16b    // v18 = tweak 7
-        bmi     .Lxts_dec_6                 // exactly six blocks
-
-        ldr     q6, [x20], #16              // block 6
-        eor     v4.16b, v4.16b, v15.16b     // block 4 ^ tweak 4
-        eor     v5.16b, v5.16b, v16.16b     // block 5 ^ tweak 5
-        str     q18, [x0]                   // next round tweak
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16                // rewind x0 to the first spilled tweak
-        sub     x22, x22, #0x10             // account for the seventh block (flags not needed)
-        eor     v6.16b, v6.16b, v17.16b     // block 6 ^ tweak 6
-
-        bl      _bsaes_decrypt8
-
-        ldr     q16, [x0], #16              // reload tweak 5
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        ldr     q17, [x0], #16              // reload tweak 6
-        eor     v6.16b, v6.16b, v13.16b
-        eor     v4.16b, v4.16b, v14.16b
-        eor     v2.16b, v2.16b, v15.16b
-        ldr     q11, [x0]                   // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        eor     v0.16b, v7.16b, v16.16b     // plaintext 5
-        eor     v1.16b, v3.16b, v17.16b     // plaintext 6
-        str     q6, [x21], #16
-        str     q4, [x21], #16
-        str     q2, [x21], #16
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_6:
-        eor     v4.16b, v4.16b, v15.16b     // block 4 ^ tweak 4
-        eor     v5.16b, v5.16b, v16.16b     // block 5 ^ tweak 5
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16                // x0 = first spilled tweak
-
-        bl      _bsaes_decrypt8
-
-        ldr     q16, [x0], #16              // reload tweak 5
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v6.16b, v6.16b, v13.16b
-        eor     v4.16b, v4.16b, v14.16b
-        ldr     q11, [x0]                   // next round tweak
-        eor     v2.16b, v2.16b, v15.16b
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        eor     v0.16b, v7.16b, v16.16b     // plaintext 5
-        str     q6, [x21], #16
-        str     q4, [x21], #16
-        str     q2, [x21], #16
-        str     q0, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_5:
-        eor     v3.16b, v3.16b, v14.16b     // block 3 ^ tweak 3
-        eor     v4.16b, v4.16b, v15.16b     // block 4 ^ tweak 4
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16                // x0 = spilled tweak 5
-
-        bl      _bsaes_decrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        ldr     q11, [x0]                   // next round tweak
-        eor     v6.16b, v6.16b, v13.16b
-        eor     v4.16b, v4.16b, v14.16b
-        eor     v2.16b, v2.16b, v15.16b
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q6, [x21], #16
-        str     q4, [x21], #16
-        str     q2, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_4:
-        eor     v2.16b, v2.16b, v13.16b     // block 2 ^ tweak 2
-        eor     v3.16b, v3.16b, v14.16b     // block 3 ^ tweak 3
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_decrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v6.16b, v6.16b, v13.16b
-        eor     v4.16b, v4.16b, v14.16b
-        mov     v11.16b, v15.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q6, [x21], #16
-        str     q4, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_3:
-        eor     v1.16b, v1.16b, v12.16b     // block 1 ^ tweak 1
-        eor     v2.16b, v2.16b, v13.16b     // block 2 ^ tweak 2
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_decrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        eor     v6.16b, v6.16b, v13.16b
-        mov     v11.16b, v14.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        str     q6, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_2:
-        eor     v0.16b, v0.16b, v11.16b     // block 0 ^ tweak 0
-        eor     v1.16b, v1.16b, v12.16b     // block 1 ^ tweak 1
-        mov     x9, sp                      // pass key schedule
-        mov     x10, x1                     // pass rounds
-        add     x0, x19, #16
-
-        bl      _bsaes_decrypt8
-
-        eor     v0.16b, v0.16b, v11.16b
-        eor     v1.16b, v1.16b, v12.16b
-        mov     v11.16b, v13.16b            // next round tweak
-        str     q0, [x21], #16
-        str     q1, [x21], #16
-        b       .Lxts_dec_done
-
-.align  4
-.Lxts_dec_1:
-        eor     v0.16b, v0.16b, v11.16b     // block 0 ^ tweak 0
-        sub     x0, sp, #16                 // in and out both point at the slot pushed just below
-        sub     x1, sp, #16
-        mov     x2, x23                     // key1
-        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
-        mov     v14.d[0], v12.d[1]
-        str     q0, [sp, #-16]!             // push the single block for in-place decryption
-
-        bl      AES_decrypt
-
-        ldr     q0, [sp], #16               // pop the decrypted block
-        trn1    v13.2d, v11.2d, v13.2d      // v13 = tweak 0 rebuilt from the saved halves
-        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
-        eor     v0.16b, v0.16b, v13.16b     // plaintext 0
-        str     q0, [x21], #16
-
-.Lxts_dec_done:
-        adds    x22, x22, #0x10             // x22 = trailing byte count (0 when nothing is left)
-        beq     .Lxts_dec_ret
-
-        // calculate one round of extra tweak for the stolen ciphertext
-        ldr     q8, .Lxts_magic
-        sshr    v6.2d, v11.2d, #63
-        and     v6.16b, v6.16b, v8.16b
-        add     v12.2d, v11.2d, v11.2d
-        ext     v6.16b, v6.16b, v6.16b, #8
-        eor     v12.16b, v12.16b, v6.16b    // v12 = last tweak; v11 stays the penultimate tweak
-
-        // perform the final decryption with the last tweak value
-        ldr     q0, [x20], #16              // last whole ciphertext block; x20 now points at the part-block
-        eor     v0.16b, v0.16b, v12.16b
-        str     q0, [sp, #-16]!             // push for in-place decryption
-        mov     x0, sp
-        mov     x1, sp
-        mov     x2, x23                     // key1
-        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
-        mov     v14.d[0], v12.d[1]
-
-        bl      AES_decrypt
-
-        trn1    v12.2d, v12.2d, v14.2d      // rebuild both tweaks from the saved halves
-        trn1    v11.2d, v11.2d, v13.2d
-        ldr     q0, [sp], #16
-        eor     v0.16b, v0.16b, v12.16b
-        str     q0, [x21]                   // x21 is not advanced here
-
-        mov     x6, x21                     // x6 = address of the block that is rebuilt below
-        // Penultimate ciphertext block produces final plaintext part-block
-        // plus remaining part of final ciphertext block. Move plaintext part
-        // to final position and re-use penultimate plaintext block buffer to
-        // construct final ciphertext block
-.Lxts_dec_steal:
-        ldrb    w1, [x21]                   // byte of the plaintext just produced
-        ldrb    w0, [x20], #1               // byte of the ciphertext part-block
-        strb    w1, [x21, #0x10]            // plaintext byte moves 16 bytes on, to its final position
-        strb    w0, [x21], #1               // ciphertext byte takes its place
-
-        subs    x22, x22, #1
-        bhi     .Lxts_dec_steal             // repeat for each trailing byte
-
-        // Finally decrypt the block rebuilt at x6 (ciphertext part-block plus the
-        // bytes left over from above) with the penultimate tweak
-        ldr     q0, [x6]
-        eor     v0.16b, v0.16b, v11.16b
-        str     q0, [sp, #-16]!
-        mov     x0, sp
-        mov     x1, sp
-        mov     x2, x23                     // key1
-        mov     x21, x6                     // result is written back over the rebuilt block
-
-        bl      AES_decrypt
-
-        trn1    v11.2d, v11.2d, v13.2d      // v13.d[0] still holds the copy of v11.d[1] made before the previous call
-        ldr     q0, [sp], #16
-        eor     v0.16b, v0.16b, v11.16b
-        str     q0, [x21]
-
-.Lxts_dec_ret:
-
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-.Lxts_dec_bzero: // wipe key schedule
-        stp     q0, q1, [sp], #32           // zero 32 bytes and pop them
-        cmp     sp, x19
-        bne     .Lxts_dec_bzero             // until sp is back at the frame base
-
-        ldp     x19, x20, [sp, #80]
-        ldp     x21, x22, [sp, #96]
-        ldr     x23, [sp, #112]
-        ldp     d8, d9, [sp, #128]
-        ldp     d10, d11, [sp, #144]
-        ldp     d12, d13, [sp, #160]
-        ldp     d14, d15, [sp, #176]
-        ldp     fp, lr, [sp], #192          // restore fp/lr and release the frame
-        ret
-.size   bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index edf6c8106e..0b9f499ee6 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -30,8 +30,8 @@ IF[{- !$disabled{asm} -}]
 
   $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S
   $AESDEF_armv4=AES_ASM BSAES_ASM
-  $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S
-  $AESDEF_aarch64=BSAES_ASM VPAES_ASM
+  $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S
+  $AESDEF_aarch64=VPAES_ASM
 
   $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
   $AESDEF_parisc11=AES_ASM
@@ -80,7 +80,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
 ENDIF
 
 GENERATE[aes-ia64.s]=asm/aes-ia64.S
-GENERATE[bsaes-armv8.S]=asm/bsaes-armv8.S
 
 GENERATE[aes-586.s]=asm/aes-586.pl
 DEPEND[aes-586.s]=../perlasm/x86asm.pl


More information about the openssl-commits mailing list