[openssl] master update
Dr. Paul Dale
pauli at openssl.org
Thu May 13 14:02:51 UTC 2021
The branch master has been updated
via da51566b256e0c0536d5b986e676863b0526bf5e (commit)
from 3ba3e350fd15c133a172095f67e6e0c99ab9b410 (commit)
- Log -----------------------------------------------------------------
commit da51566b256e0c0536d5b986e676863b0526bf5e
Author: Ben Avison <bavison at riscosopen.org>
Date: Wed Mar 10 15:54:44 2021 +0000
ARM assembly pack: translate bit-sliced AES implementation to AArch64
Reviewed-by: Tomas Mraz <tomas at openssl.org>
Reviewed-by: Paul Dale <pauli at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14592)
-----------------------------------------------------------------------
Summary of changes:
crypto/aes/asm/bsaes-armv8.S | 2338 ++++++++++++++++++++++++++++++++++++++++++
crypto/aes/build.info | 5 +-
2 files changed, 2341 insertions(+), 2 deletions(-)
create mode 100644 crypto/aes/asm/bsaes-armv8.S
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
new file mode 100644
index 0000000000..9bd02d0c8a
--- /dev/null
+++ b/crypto/aes/asm/bsaes-armv8.S
@@ -0,0 +1,2338 @@
+// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// ====================================================================
+// Written by Ben Avison <bavison at riscosopen.org> for the OpenSSL
+// project. Rights for redistribution and usage in source and binary
+// forms are granted according to the OpenSSL license.
+// ====================================================================
+//
+// This implementation is a translation of bsaes-armv7 for AArch64.
+// No attempt has been made to carry across the build switches for
+// kernel targets, since the Linux kernel crypto support has moved on
+// from when it was based on OpenSSL.
+
+// A lot of hand-scheduling has been performed. Consequently, this code
+// doesn't factor out neatly into macros in the same way that the
+// AArch32 version did, and there is little to be gained by wrapping it
+// up in Perl, so it is presented as pure assembly.
+
+
+#include "crypto/arm_arch.h"
+
+.text
+
+.type _bsaes_decrypt8,%function
+.align 4
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
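
The eight input blocks are first re-packed into bit-sliced form: the ushr/eor/and/eor/shl/eor runs against the 0x55, 0x33 and 0x0f masks below are masked bit swaps between pairs of registers, and .Ldec_done applies the same swaps to undo the packing. A minimal scalar C sketch of one such swap step, with hypothetical names that are not part of this commit:

    #include <stdint.h>

    /* Exchange bit i of *b with bit i+d of *a for every bit i selected by
     * mask.  The sequences above apply exactly this pattern to pairs of
     * v-registers with (d, mask) = (1, 0x55..), (2, 0x33..) and (4, 0x0f..). */
    static void delta_swap(uint64_t *a, uint64_t *b, uint64_t mask, unsigned d)
    {
        uint64_t t = ((*a >> d) ^ *b) & mask;   /* bits that need to move */

        *b ^= t;
        *a ^= t << d;
    }
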
+_bsaes_decrypt8:
+ ldr q8, [x9], #16
+ adr x11, .LM0ISR
+ movi v9.16b, #0x55
+ ldr q10, [x11], #16
+ movi v16.16b, #0x33
+ movi v17.16b, #0x0f
+ sub x10, x10, #1
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v10.16b
+ tbl v1.16b, {v1.16b}, v10.16b
+ tbl v2.16b, {v2.16b}, v10.16b
+ tbl v4.16b, {v4.16b}, v10.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v10.16b
+ tbl v5.16b, {v5.16b}, v10.16b
+ tbl v6.16b, {v6.16b}, v10.16b
+ ushr v8.2d, v0.2d, #1
+ tbl v7.16b, {v7.16b}, v10.16b
+ ushr v10.2d, v4.2d, #1
+ ushr v18.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ ushr v19.2d, v6.2d, #1
+ eor v10.16b, v10.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v10.16b, v10.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v9.16b, v19.16b, v9.16b
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #1
+ eor v3.16b, v3.16b, v18.16b
+ shl v18.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v9.2d, #1
+ eor v7.16b, v7.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v18.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v18.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v10.16b, v10.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+.Ldec_sbox:
+ eor v1.16b, v1.16b, v4.16b
+ eor v3.16b, v3.16b, v4.16b
+ subs x10, x10, #1
+ eor v4.16b, v4.16b, v7.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v1.16b, v6.16b
+ eor v6.16b, v6.16b, v4.16b
+ eor v2.16b, v2.16b, v5.16b
+ eor v0.16b, v0.16b, v1.16b
+ eor v7.16b, v7.16b, v6.16b
+ eor v8.16b, v6.16b, v2.16b
+ and v9.16b, v4.16b, v6.16b
+ eor v10.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v16.16b, v7.16b, v4.16b
+ eor v17.16b, v4.16b, v0.16b
+ and v18.16b, v0.16b, v2.16b
+ eor v19.16b, v7.16b, v4.16b
+ eor v1.16b, v1.16b, v3.16b
+ eor v20.16b, v3.16b, v0.16b
+ eor v21.16b, v5.16b, v2.16b
+ eor v22.16b, v3.16b, v7.16b
+ and v8.16b, v17.16b, v8.16b
+ orr v17.16b, v3.16b, v5.16b
+ eor v23.16b, v1.16b, v6.16b
+ eor v24.16b, v20.16b, v16.16b
+ eor v25.16b, v1.16b, v5.16b
+ orr v26.16b, v20.16b, v21.16b
+ and v20.16b, v20.16b, v21.16b
+ and v27.16b, v7.16b, v1.16b
+ eor v21.16b, v21.16b, v23.16b
+ orr v28.16b, v16.16b, v23.16b
+ orr v29.16b, v22.16b, v25.16b
+ eor v26.16b, v26.16b, v8.16b
+ and v16.16b, v16.16b, v23.16b
+ and v22.16b, v22.16b, v25.16b
+ and v21.16b, v24.16b, v21.16b
+ eor v8.16b, v28.16b, v8.16b
+ eor v23.16b, v5.16b, v2.16b
+ eor v24.16b, v1.16b, v6.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v22.16b, v3.16b, v0.16b
+ eor v25.16b, v29.16b, v21.16b
+ eor v21.16b, v26.16b, v21.16b
+ eor v8.16b, v8.16b, v20.16b
+ eor v26.16b, v23.16b, v24.16b
+ eor v16.16b, v16.16b, v20.16b
+ eor v28.16b, v22.16b, v19.16b
+ eor v20.16b, v25.16b, v20.16b
+ eor v9.16b, v21.16b, v9.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v21.16b, v16.16b, v17.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v20.16b, v27.16b
+ eor v20.16b, v3.16b, v7.16b
+ eor v25.16b, v9.16b, v8.16b
+ eor v27.16b, v0.16b, v4.16b
+ and v29.16b, v9.16b, v17.16b
+ eor v30.16b, v8.16b, v29.16b
+ eor v31.16b, v21.16b, v29.16b
+ eor v29.16b, v21.16b, v29.16b
+ bsl v30.16b, v17.16b, v21.16b
+ bsl v31.16b, v9.16b, v8.16b
+ bsl v16.16b, v30.16b, v29.16b
+ bsl v21.16b, v29.16b, v30.16b
+ eor v8.16b, v31.16b, v30.16b
+ and v1.16b, v1.16b, v31.16b
+ and v9.16b, v16.16b, v31.16b
+ and v6.16b, v6.16b, v30.16b
+ eor v16.16b, v17.16b, v21.16b
+ and v4.16b, v4.16b, v30.16b
+ eor v17.16b, v8.16b, v30.16b
+ and v21.16b, v24.16b, v8.16b
+ eor v9.16b, v9.16b, v25.16b
+ and v19.16b, v19.16b, v8.16b
+ eor v24.16b, v30.16b, v16.16b
+ eor v25.16b, v30.16b, v16.16b
+ and v7.16b, v7.16b, v17.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v29.16b, v9.16b, v16.16b
+ eor v30.16b, v31.16b, v9.16b
+ and v0.16b, v24.16b, v0.16b
+ and v9.16b, v18.16b, v9.16b
+ and v2.16b, v25.16b, v2.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v18.16b, v29.16b, v16.16b
+ and v5.16b, v30.16b, v5.16b
+ eor v24.16b, v8.16b, v29.16b
+ and v25.16b, v26.16b, v29.16b
+ and v26.16b, v28.16b, v29.16b
+ eor v8.16b, v8.16b, v29.16b
+ eor v17.16b, v17.16b, v18.16b
+ eor v5.16b, v1.16b, v5.16b
+ and v23.16b, v24.16b, v23.16b
+ eor v21.16b, v21.16b, v25.16b
+ eor v19.16b, v19.16b, v26.16b
+ eor v0.16b, v4.16b, v0.16b
+ and v3.16b, v17.16b, v3.16b
+ eor v1.16b, v9.16b, v1.16b
+ eor v9.16b, v25.16b, v23.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v2.16b, v6.16b, v2.16b
+ and v6.16b, v8.16b, v22.16b
+ eor v3.16b, v7.16b, v3.16b
+ and v8.16b, v20.16b, v18.16b
+ eor v10.16b, v10.16b, v9.16b
+ eor v0.16b, v0.16b, v19.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v1.16b, v2.16b, v21.16b
+ eor v3.16b, v3.16b, v19.16b
+ and v16.16b, v27.16b, v16.16b
+ eor v17.16b, v26.16b, v6.16b
+ eor v6.16b, v8.16b, v7.16b
+ eor v7.16b, v1.16b, v9.16b
+ eor v1.16b, v5.16b, v3.16b
+ eor v2.16b, v10.16b, v3.16b
+ eor v4.16b, v16.16b, v4.16b
+ eor v8.16b, v6.16b, v17.16b
+ eor v5.16b, v9.16b, v3.16b
+ eor v9.16b, v0.16b, v1.16b
+ eor v6.16b, v7.16b, v1.16b
+ eor v0.16b, v4.16b, v17.16b
+ eor v4.16b, v8.16b, v7.16b
+ eor v7.16b, v9.16b, v2.16b
+ eor v8.16b, v3.16b, v0.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v3.16b, v4.16b, v7.16b
+ eor v4.16b, v7.16b, v0.16b
+ eor v7.16b, v8.16b, v3.16b
+ bcc .Ldec_done
+ ext v8.16b, v0.16b, v0.16b, #8
+ ext v9.16b, v1.16b, v1.16b, #8
+ ldr q28, [x11] // load from .LISR in common case (x10 > 0)
+ ext v10.16b, v6.16b, v6.16b, #8
+ ext v16.16b, v3.16b, v3.16b, #8
+ ext v17.16b, v5.16b, v5.16b, #8
+ ext v18.16b, v4.16b, v4.16b, #8
+ eor v8.16b, v8.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v17.16b, v17.16b, v5.16b
+ ext v19.16b, v2.16b, v2.16b, #8
+ ext v20.16b, v7.16b, v7.16b, #8
+ eor v18.16b, v18.16b, v4.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v8.16b, v2.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v19.16b, v2.16b
+ eor v9.16b, v20.16b, v7.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v8.16b, v8.16b, v16.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v2.16b, v3.16b, v2.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v3.16b, v5.16b, v9.16b
+ eor v5.16b, v8.16b, v17.16b
+ eor v7.16b, v7.16b, v17.16b
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v6.16b, v6.16b, #12
+ ext v10.16b, v4.16b, v4.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v5.16b, v5.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v3.16b, v3.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v17.16b, v17.16b, v4.16b
+ eor v10.16b, v10.16b, v6.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v3.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v18.16b, v18.16b, v5.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v21.16b, v5.16b, v5.16b, #8
+ ext v5.16b, v7.16b, v7.16b, #8
+ eor v7.16b, v20.16b, v2.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ ext v20.16b, v3.16b, v3.16b, #8
+ eor v17.16b, v17.16b, v3.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v3.16b, v10.16b, v3.16b
+ ext v10.16b, v6.16b, v6.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v18.16b
+ eor v3.16b, v3.16b, v4.16b
+ eor v7.16b, v20.16b, v7.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v4.16b, v21.16b, v17.16b
+ eor v2.16b, v10.16b, v9.16b
+ bne .Ldec_loop
+ ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v2.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v6.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v3.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v4.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v4.16b, v4.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v6.16b, v6.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v3.16b, v3.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v2.2d, #2
+ eor v8.16b, v8.16b, v4.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v4.16b, v4.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v8.2d, v4.2d, #4
+ ushr v9.2d, v6.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v3.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v2.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v4.16b, v4.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+// InvShiftRows constants
+// Used in _bsaes_decrypt8, which assumes contiguity
+// .LM0ISR used with round 0 key
+// .LISR used with middle round keys
+// .LISRM0 used with final round key
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+
+// ShiftRows constants
+// Used in _bsaes_encrypt8, which assumes contiguity
+// .LM0SR used with round 0 key
+// .LSR used with middle round keys
+// .LSRM0 used with final round key
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+
+.LM0_bigendian:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0_littleendian:
+.quad 0x0105090d0004080c, 0x03070b0f02060a0e
+
+// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
+// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
+.LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
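
Each .quad pair in _bsaes_const above encodes a 16-entry byte-index table (laid out in memory byte order on the usual little-endian targets) consumed by the tbl instructions in the routines of this file: with a single source register, tbl sets destination byte i to source byte index[i], and any index of 16 or more yields zero. A hedged C model of that lookup; the function name is illustrative, not from this commit:

    #include <stdint.h>
    #include <string.h>

    /* Model of AArch64 "tbl Vd.16b, {Vn.16b}, Vm.16b". */
    static void tbl16(uint8_t dst[16], const uint8_t src[16],
                      const uint8_t idx[16])
    {
        uint8_t out[16];

        for (int i = 0; i < 16; i++)
            out[i] = idx[i] < 16 ? src[idx[i]] : 0;
        memcpy(dst, out, 16);               /* tolerate dst == src */
    }
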
+.type _bsaes_encrypt8,%function
+.align 4
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
+_bsaes_encrypt8:
+ ldr q8, [x9], #16
+ adr x11, .LM0SR
+ ldr q9, [x11], #16
+_bsaes_encrypt8_alt:
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ sub x10, x10, #1
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v9.16b
+ tbl v1.16b, {v1.16b}, v9.16b
+ tbl v2.16b, {v2.16b}, v9.16b
+ tbl v4.16b, {v4.16b}, v9.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v9.16b
+ tbl v5.16b, {v5.16b}, v9.16b
+ tbl v6.16b, {v6.16b}, v9.16b
+ ushr v8.2d, v0.2d, #1
+ movi v10.16b, #0x55
+ tbl v7.16b, {v7.16b}, v9.16b
+ ushr v9.2d, v4.2d, #1
+ movi v16.16b, #0x33
+ ushr v17.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ movi v18.16b, #0x0f
+ ushr v19.2d, v6.2d, #1
+ eor v9.16b, v9.16b, v5.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v10.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v9.16b, v9.16b, v10.16b
+ and v17.16b, v17.16b, v10.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v10.16b, v19.16b, v10.16b
+ eor v5.16b, v5.16b, v9.16b
+ shl v9.2d, v9.2d, #1
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v10.2d, #1
+ eor v7.16b, v7.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v17.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v17.16b, v17.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v17.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v18.16b
+ and v9.16b, v9.16b, v18.16b
+ and v10.16b, v10.16b, v18.16b
+ and v16.16b, v16.16b, v18.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+.Lenc_sbox:
+ eor v5.16b, v5.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ subs x10, x10, #1
+ eor v2.16b, v2.16b, v1.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v8.16b, v3.16b, v7.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v8.16b, v8.16b, v4.16b
+ eor v3.16b, v6.16b, v3.16b
+ eor v4.16b, v4.16b, v5.16b
+ eor v6.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v8.16b, v1.16b
+ eor v8.16b, v7.16b, v4.16b
+ eor v9.16b, v3.16b, v0.16b
+ eor v10.16b, v7.16b, v6.16b
+ eor v16.16b, v5.16b, v3.16b
+ eor v17.16b, v6.16b, v2.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v19.16b, v2.16b, v4.16b
+ eor v20.16b, v1.16b, v0.16b
+ orr v21.16b, v8.16b, v9.16b
+ orr v22.16b, v10.16b, v16.16b
+ eor v23.16b, v8.16b, v17.16b
+ eor v24.16b, v9.16b, v18.16b
+ and v19.16b, v19.16b, v20.16b
+ orr v20.16b, v17.16b, v18.16b
+ and v8.16b, v8.16b, v9.16b
+ and v9.16b, v17.16b, v18.16b
+ and v17.16b, v23.16b, v24.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v16.16b, v21.16b, v19.16b
+ eor v18.16b, v20.16b, v19.16b
+ and v19.16b, v2.16b, v1.16b
+ and v20.16b, v6.16b, v5.16b
+ eor v21.16b, v22.16b, v17.16b
+ eor v9.16b, v9.16b, v10.16b
+ eor v10.16b, v16.16b, v17.16b
+ eor v16.16b, v18.16b, v8.16b
+ and v17.16b, v4.16b, v0.16b
+ orr v18.16b, v7.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v8.16b, v9.16b, v8.16b
+ eor v9.16b, v10.16b, v19.16b
+ eor v10.16b, v3.16b, v0.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v5.16b, v1.16b
+ eor v19.16b, v21.16b, v20.16b
+ eor v20.16b, v8.16b, v18.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v7.16b, v4.16b
+ eor v21.16b, v9.16b, v16.16b
+ eor v22.16b, v6.16b, v2.16b
+ and v23.16b, v9.16b, v19.16b
+ eor v24.16b, v10.16b, v17.16b
+ eor v25.16b, v0.16b, v1.16b
+ eor v26.16b, v7.16b, v6.16b
+ eor v27.16b, v18.16b, v22.16b
+ eor v28.16b, v3.16b, v5.16b
+ eor v29.16b, v16.16b, v23.16b
+ eor v30.16b, v20.16b, v23.16b
+ eor v23.16b, v20.16b, v23.16b
+ eor v31.16b, v4.16b, v2.16b
+ bsl v29.16b, v19.16b, v20.16b
+ bsl v30.16b, v9.16b, v16.16b
+ bsl v8.16b, v29.16b, v23.16b
+ bsl v20.16b, v23.16b, v29.16b
+ eor v9.16b, v30.16b, v29.16b
+ and v5.16b, v5.16b, v30.16b
+ and v8.16b, v8.16b, v30.16b
+ and v1.16b, v1.16b, v29.16b
+ eor v16.16b, v19.16b, v20.16b
+ and v2.16b, v2.16b, v29.16b
+ eor v19.16b, v9.16b, v29.16b
+ and v17.16b, v17.16b, v9.16b
+ eor v8.16b, v8.16b, v21.16b
+ and v20.16b, v22.16b, v9.16b
+ eor v21.16b, v29.16b, v16.16b
+ eor v22.16b, v29.16b, v16.16b
+ and v23.16b, v25.16b, v16.16b
+ and v6.16b, v6.16b, v19.16b
+ eor v25.16b, v8.16b, v16.16b
+ eor v29.16b, v30.16b, v8.16b
+ and v4.16b, v21.16b, v4.16b
+ and v8.16b, v28.16b, v8.16b
+ and v0.16b, v22.16b, v0.16b
+ eor v21.16b, v23.16b, v1.16b
+ eor v22.16b, v9.16b, v25.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v23.16b, v25.16b, v16.16b
+ and v3.16b, v29.16b, v3.16b
+ and v24.16b, v24.16b, v25.16b
+ and v25.16b, v27.16b, v25.16b
+ and v10.16b, v22.16b, v10.16b
+ and v9.16b, v9.16b, v18.16b
+ eor v18.16b, v19.16b, v23.16b
+ and v19.16b, v26.16b, v23.16b
+ eor v3.16b, v5.16b, v3.16b
+ eor v17.16b, v17.16b, v24.16b
+ eor v10.16b, v24.16b, v10.16b
+ and v16.16b, v31.16b, v16.16b
+ eor v20.16b, v20.16b, v25.16b
+ eor v9.16b, v25.16b, v9.16b
+ eor v4.16b, v2.16b, v4.16b
+ and v7.16b, v18.16b, v7.16b
+ eor v18.16b, v19.16b, v6.16b
+ eor v5.16b, v8.16b, v5.16b
+ eor v0.16b, v1.16b, v0.16b
+ eor v1.16b, v21.16b, v10.16b
+ eor v8.16b, v3.16b, v17.16b
+ eor v2.16b, v16.16b, v2.16b
+ eor v3.16b, v6.16b, v7.16b
+ eor v6.16b, v18.16b, v9.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v10.16b, v5.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v9.16b, v2.16b, v9.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v7.16b, v6.16b, v1.16b
+ eor v5.16b, v8.16b, v4.16b
+ eor v6.16b, v10.16b, v1.16b
+ eor v2.16b, v4.16b, v0.16b
+ eor v4.16b, v3.16b, v10.16b
+ eor v9.16b, v9.16b, v7.16b
+ eor v3.16b, v0.16b, v5.16b
+ eor v0.16b, v1.16b, v4.16b
+ eor v1.16b, v4.16b, v8.16b
+ eor v4.16b, v9.16b, v5.16b
+ eor v6.16b, v6.16b, v3.16b
+ bcc .Lenc_done
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v4.16b, v4.16b, #12
+ ldr q28, [x11]
+ ext v10.16b, v6.16b, v6.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v3.16b, v3.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v5.16b, v5.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v5.16b, v5.16b, v20.16b
+ eor v17.16b, v17.16b, v6.16b
+ eor v10.16b, v10.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v5.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v20.16b, v20.16b, v2.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v21.16b, v5.16b, v5.16b, #8
+ eor v17.16b, v17.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v10.16b, v10.16b, v5.16b
+ ext v22.16b, v4.16b, v4.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v7.16b, v18.16b
+ eor v4.16b, v3.16b, v17.16b
+ eor v3.16b, v6.16b, v10.16b
+ eor v7.16b, v21.16b, v20.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v2.16b, v22.16b, v9.16b
+ bne .Lenc_loop
+ ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v3.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v4.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v2.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v6.16b, v6.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v3.16b, v3.16b, v9.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v3.2d, #2
+ eor v8.16b, v8.16b, v6.16b
+ eor v9.16b, v9.16b, v4.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v2.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v6.16b, v6.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v4.16b, v4.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ ushr v8.2d, v6.2d, #4
+ ushr v9.2d, v4.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v2.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v2.16b, v2.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v6.16b, v6.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_key_convert,%function
+.align 4
+// On entry:
+// x9 -> input key (big-endian)
+// x10 = number of rounds
+// x17 -> output key (native endianness)
+// On exit:
+// x9, x10 corrupted
+// x11 -> .LM0_bigendian
+// x17 -> last quadword of output key
+// other general-purpose registers preserved
+// v2-v6 preserved
+// v7.16b[] = 0x63
+// v8-v14 preserved
+// v15 = last round key (converted to native endianness)
+// other SIMD registers corrupted
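
In plain C terms, each pass of the loop below permutes the next round key through the .LM0 constant, XORs in the S-box constant 0x63, and then uses the eight cmtst instructions to spread every key bit into a full byte mask, so each inner round key occupies 8 x 16 = 128 bytes of bit-sliced schedule. A rough sketch of that final expansion step, using hypothetical names rather than the commit's code:

    #include <stdint.h>

    /* Expand one already-permuted, 0x63-adjusted 16-byte round key into the
     * bit-sliced form stored by .Lkey_loop: one 16-byte 0x00/0xFF mask vector
     * per bit position, mirroring cmtst v24..v31 above. */
    static void bitslice_round_key(uint8_t out[8][16], const uint8_t rk[16])
    {
        for (int bit = 0; bit < 8; bit++)
            for (int i = 0; i < 16; i++)
                out[bit][i] = (rk[i] & (1u << bit)) ? 0xFF : 0x00;
    }
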
+_bsaes_key_convert:
+#ifdef __ARMEL__
+ adr x11, .LM0_littleendian
+#else
+ adr x11, .LM0_bigendian
+#endif
+ ldr q0, [x9], #16 // load round 0 key
+ ldr q1, [x11] // .LM0
+ ldr q15, [x9], #16 // load round 1 key
+
+ movi v7.16b, #0x63 // compose .L63
+ movi v16.16b, #0x01 // bit masks
+ movi v17.16b, #0x02
+ movi v18.16b, #0x04
+ movi v19.16b, #0x08
+ movi v20.16b, #0x10
+ movi v21.16b, #0x20
+ movi v22.16b, #0x40
+ movi v23.16b, #0x80
+
+#ifdef __ARMEL__
+ rev32 v0.16b, v0.16b
+#endif
+ sub x10, x10, #1
+ str q0, [x17], #16 // save round 0 key
+
+.align 4
+.Lkey_loop:
+ tbl v0.16b, {v15.16b}, v1.16b
+ ldr q15, [x9], #16 // load next round key
+
+ eor v0.16b, v0.16b, v7.16b
+ cmtst v24.16b, v0.16b, v16.16b
+ cmtst v25.16b, v0.16b, v17.16b
+ cmtst v26.16b, v0.16b, v18.16b
+ cmtst v27.16b, v0.16b, v19.16b
+ cmtst v28.16b, v0.16b, v20.16b
+ cmtst v29.16b, v0.16b, v21.16b
+ cmtst v30.16b, v0.16b, v22.16b
+ cmtst v31.16b, v0.16b, v23.16b
+ sub x10, x10, #1
+ st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
+ st1 {v28.16b-v31.16b}, [x17], #64
+ cbnz x10, .Lkey_loop
+
+ // don't save last round key
+#ifdef __ARMEL__
+ rev32 v15.16b, v15.16b
+ adr x11, .LM0_bigendian
+#endif
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 4
+// On entry:
+// x0 -> input ciphertext
+// x1 -> output plaintext
+// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
+// x3 -> key
+// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
+// w5 must be == 0
+// On exit:
+// Output plaintext filled in
+// Initialisation vector overwritten with last quadword of ciphertext
+// No output registers, usual AAPCS64 register preservation
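
The fast path below computes ordinary CBC decryption, eight blocks per iteration: P[i] = D(C[i]) ^ C[i-1] with C[-1] = IV, and the caller's IV buffer is left holding the last ciphertext block. A hedged reference sketch of that relation; aes_decrypt_block() is a hypothetical single-block helper, not an OpenSSL symbol:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void aes_decrypt_block(const uint8_t in[16], uint8_t out[16],
                           const void *key);

    static void cbc_decrypt_ref(const uint8_t *in, uint8_t *out, size_t len,
                                const void *key, uint8_t ivec[16])
    {
        uint8_t prev[16], cur[16];

        memcpy(prev, ivec, 16);
        for (size_t off = 0; off + 16 <= len; off += 16) {
            memcpy(cur, in + off, 16);       /* keep C[i]; in may alias out */
            aes_decrypt_block(cur, out + off, key);
            for (int i = 0; i < 16; i++)
                out[off + i] ^= prev[i];     /* ^= C[i-1] (or the IV)       */
            memcpy(prev, cur, 16);
        }
        memcpy(ivec, prev, 16);              /* hand back last ciphertext   */
    }
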
+bsaes_cbc_encrypt:
+ cmp x2, #128
+ blo AES_cbc_encrypt
+
+ // it is up to the caller to make sure we are called with enc == 0
+
+ stp fp, lr, [sp, #-48]!
+ stp d8, d9, [sp, #16]
+ stp d10, d15, [sp, #32]
+ lsr x2, x2, #4 // len in 16 byte blocks
+
+ ldr w15, [x3, #240] // get # of rounds
+ mov x14, sp
+
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x3 // pass key
+ mov x10, x15 // pass # of rounds
+ mov sp, x17 // sp is sp
+ bl _bsaes_key_convert
+ ldr q6, [sp]
+ str q15, [x17] // save last round key
+ eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
+ str q6, [sp]
+
+ ldr q15, [x4] // load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs x2, x2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ ldr q0, [x0], #16 // load input
+ mov x9, sp // pass the key
+ ldr q1, [x0], #16
+ mov x10, x15
+ ldr q2, [x0], #16
+ ldr q3, [x0], #16
+ ldr q4, [x0], #16
+ ldr q5, [x0], #16
+ ldr q6, [x0], #16
+ ldr q7, [x0], #-7*16
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ eor v1.16b, v1.16b, v16.16b
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ str q1, [x1], #16
+ ldr q1, [x0], #16
+ eor v1.16b, v4.16b, v1.16b
+ ldr q4, [x0], #16
+ eor v2.16b, v2.16b, v4.16b
+ eor v0.16b, v6.16b, v0.16b
+ ldr q4, [x0], #16
+ str q0, [x1], #16
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v4.16b
+ ldr q1, [x0], #16
+ str q2, [x1], #16
+ ldr q2, [x0], #16
+ ldr q15, [x0], #16
+ str q0, [x1], #16
+ eor v0.16b, v5.16b, v2.16b
+ eor v1.16b, v3.16b, v1.16b
+ str q1, [x1], #16
+ str q0, [x1], #16
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds x2, x2, #8
+ beq .Lcbc_dec_done
+
+ ldr q0, [x0], #16 // load input
+ cmp x2, #2
+ blo .Lcbc_dec_one
+ ldr q1, [x0], #16
+ mov x9, sp // pass the key
+ mov x10, x15
+ beq .Lcbc_dec_two
+ ldr q2, [x0], #16
+ cmp x2, #4
+ blo .Lcbc_dec_three
+ ldr q3, [x0], #16
+ beq .Lcbc_dec_four
+ ldr q4, [x0], #16
+ cmp x2, #6
+ blo .Lcbc_dec_five
+ ldr q5, [x0], #16
+ beq .Lcbc_dec_six
+ ldr q6, [x0], #-6*16
+
+ bl _bsaes_decrypt8
+
+ ldr q5, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q8, [x0], #16
+ ldr q9, [x0], #16
+ ldr q10, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ eor v1.16b, v1.16b, v5.16b
+ ldr q5, [x0], #16
+ eor v6.16b, v6.16b, v8.16b
+ ldr q15, [x0]
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v0.16b
+ str q6, [x1], #16
+ eor v1.16b, v3.16b, v5.16b
+ str q4, [x1], #16
+ str q2, [x1], #16
+ str q0, [x1], #16
+ str q1, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub x0, x0, #0x60
+ bl _bsaes_decrypt8
+ ldr q3, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q5, [x0], #16
+ ldr q8, [x0], #16
+ ldr q9, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q0, [x0], #16
+ eor v1.16b, v1.16b, v3.16b
+ ldr q15, [x0]
+ eor v3.16b, v6.16b, v5.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ str q1, [x1], #16
+ eor v0.16b, v7.16b, v0.16b
+ str q3, [x1], #16
+ str q4, [x1], #16
+ str q2, [x1], #16
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub x0, x0, #0x50
+ bl _bsaes_decrypt8
+ ldr q3, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q5, [x0], #16
+ ldr q7, [x0], #16
+ ldr q8, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q15, [x0]
+ eor v0.16b, v1.16b, v3.16b
+ eor v1.16b, v6.16b, v5.16b
+ eor v3.16b, v4.16b, v7.16b
+ str q0, [x1], #16
+ eor v0.16b, v2.16b, v8.16b
+ str q1, [x1], #16
+ str q3, [x1], #16
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub x0, x0, #0x40
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q3, [x0], #16
+ ldr q5, [x0], #16
+ str q0, [x1], #16 // write output
+ ldr q15, [x0]
+ eor v0.16b, v1.16b, v2.16b
+ eor v1.16b, v6.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ str q0, [x1], #16
+ str q1, [x1], #16
+ str q2, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub x0, x0, #0x30
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q3, [x0], #16
+ ldr q15, [x0]
+ str q0, [x1], #16 // write output
+ eor v0.16b, v1.16b, v2.16b
+ eor v1.16b, v6.16b, v3.16b
+ str q0, [x1], #16
+ str q1, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub x0, x0, #0x20
+ bl _bsaes_decrypt8
+ ldr q2, [x0], #16 // reload input
+ eor v0.16b, v0.16b, v15.16b // ^= IV
+ ldr q15, [x0]
+ str q0, [x1], #16 // write output
+ eor v0.16b, v1.16b, v2.16b
+ str q0, [x1]
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub x0, x0, #0x10
+ stp x1, x4, [sp, #-32]!
+ str x14, [sp, #16]
+ mov v8.16b, v15.16b
+ mov v15.16b, v0.16b
+ mov x2, x3
+ bl AES_decrypt
+ ldr x14, [sp, #16]
+ ldp x1, x4, [sp], #32
+ ldr q0, [x1] // load result
+ eor v0.16b, v0.16b, v8.16b // ^= IV
+ str q0, [x1] // write output
+
+.align 4
+.Lcbc_dec_done:
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lcbc_dec_bzero:// wipe key schedule [if any]
+ stp q0, q1, [sp], #32
+ cmp sp, x14
+ bne .Lcbc_dec_bzero
+ str q15, [x4] // return IV
+ ldp d8, d9, [sp, #16]
+ ldp d10, d15, [sp, #32]
+ ldp fp, lr, [sp], #48
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 4
+// On entry:
+// x0 -> input text (whole 16-byte blocks)
+// x1 -> output text (whole 16-byte blocks)
+// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
+// x3 -> key
+// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
+// On exit:
+// Output text filled in
+// No output registers, usual AAPCS64 register preservation
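
The mode here is CTR with a 32-bit counter: only the last (big-endian) word of the 16-byte counter block increments, modulo 2^32, between blocks. A hedged C reference, with aes_encrypt_block() as a hypothetical single-block helper:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void aes_encrypt_block(const uint8_t in[16], uint8_t out[16],
                           const void *key);

    /* out block i = in block i ^ E_K(iv[0..11] || BE32(ctr0 + i)) */
    static void ctr32_ref(const uint8_t *in, uint8_t *out, size_t blocks,
                          const void *key, const uint8_t ivec[16])
    {
        uint8_t ctr[16], ks[16];
        uint32_t c;

        memcpy(ctr, ivec, 16);
        c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
            ((uint32_t)ctr[14] << 8) | ctr[15];
        for (size_t i = 0; i < blocks; i++) {
            aes_encrypt_block(ctr, ks, key);
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = in[16 * i + j] ^ ks[j];
            c++;                             /* wraps modulo 2^32 */
            ctr[12] = (uint8_t)(c >> 24);
            ctr[13] = (uint8_t)(c >> 16);
            ctr[14] = (uint8_t)(c >> 8);
            ctr[15] = (uint8_t)c;
        }
    }
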
+bsaes_ctr32_encrypt_blocks:
+
+ cmp x2, #8 // use plain AES for
+ blo .Lctr_enc_short // small sizes
+
+ stp fp, lr, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldr w15, [x3, #240] // get # of rounds
+ mov x14, sp
+
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x3 // pass key
+ mov x10, x15 // pass # of rounds
+ mov sp, x17 // sp is sp
+ bl _bsaes_key_convert
+ eor v7.16b, v7.16b, v15.16b // fix up last round key
+ str q7, [x17] // save last round key
+
+ ldr q0, [x4] // load counter
+ add x13, x11, #.LREVM0SR-.LM0_bigendian
+ ldr q4, [sp] // load round0 key
+
+ movi v8.4s, #1 // compose 1<<96
+ movi v9.16b, #0
+ rev32 v15.16b, v0.16b
+ rev32 v0.16b, v0.16b
+ ext v11.16b, v9.16b, v8.16b, #4
+ rev32 v4.16b, v4.16b
+ add v12.4s, v11.4s, v11.4s // compose 2<<96
+ str q4, [sp] // save adjusted round0 key
+ add v13.4s, v11.4s, v12.4s // compose 3<<96
+ add v14.4s, v12.4s, v12.4s // compose 4<<96
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ // Intermix prologue from _bsaes_encrypt8 to use the opportunity
+ // to flip byte order in 32-bit counter
+
+ add v1.4s, v15.4s, v11.4s // +1
+ add x9, sp, #0x10 // pass next round key
+ add v2.4s, v15.4s, v12.4s // +2
+ ldr q9, [x13] // .LREVM0SR
+ ldr q8, [sp] // load round0 key
+ add v3.4s, v15.4s, v13.4s // +3
+ mov x10, x15 // pass rounds
+ sub x11, x13, #.LREVM0SR-.LSR // pass constants
+ add v6.4s, v2.4s, v14.4s
+ add v4.4s, v15.4s, v14.4s // +4
+ add v7.4s, v3.4s, v14.4s
+ add v15.4s, v4.4s, v14.4s // next counter
+ add v5.4s, v1.4s, v14.4s
+
+ bl _bsaes_encrypt8_alt
+
+ subs x2, x2, #8
+ blo .Lctr_enc_loop_done
+
+ ldr q16, [x0], #16
+ ldr q17, [x0], #16
+ eor v1.16b, v1.16b, v17.16b
+ ldr q17, [x0], #16
+ eor v0.16b, v0.16b, v16.16b
+ eor v4.16b, v4.16b, v17.16b
+ str q0, [x1], #16
+ ldr q16, [x0], #16
+ str q1, [x1], #16
+ mov v0.16b, v15.16b
+ str q4, [x1], #16
+ ldr q1, [x0], #16
+ eor v4.16b, v6.16b, v16.16b
+ eor v1.16b, v3.16b, v1.16b
+ ldr q3, [x0], #16
+ eor v3.16b, v7.16b, v3.16b
+ ldr q6, [x0], #16
+ eor v2.16b, v2.16b, v6.16b
+ ldr q6, [x0], #16
+ eor v5.16b, v5.16b, v6.16b
+ str q4, [x1], #16
+ str q1, [x1], #16
+ str q3, [x1], #16
+ str q2, [x1], #16
+ str q5, [x1], #16
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add x2, x2, #8
+ ldr q16, [x0], #16 // load input
+ eor v0.16b, v0.16b, v16.16b
+ str q0, [x1], #16 // write output
+ cmp x2, #2
+ blo .Lctr_enc_done
+ ldr q17, [x0], #16
+ eor v1.16b, v1.16b, v17.16b
+ str q1, [x1], #16
+ beq .Lctr_enc_done
+ ldr q18, [x0], #16
+ eor v4.16b, v4.16b, v18.16b
+ str q4, [x1], #16
+ cmp x2, #4
+ blo .Lctr_enc_done
+ ldr q19, [x0], #16
+ eor v6.16b, v6.16b, v19.16b
+ str q6, [x1], #16
+ beq .Lctr_enc_done
+ ldr q20, [x0], #16
+ eor v3.16b, v3.16b, v20.16b
+ str q3, [x1], #16
+ cmp x2, #6
+ blo .Lctr_enc_done
+ ldr q21, [x0], #16
+ eor v7.16b, v7.16b, v21.16b
+ str q7, [x1], #16
+ beq .Lctr_enc_done
+ ldr q22, [x0]
+ eor v2.16b, v2.16b, v22.16b
+ str q2, [x1], #16
+
+.Lctr_enc_done:
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lctr_enc_bzero: // wipe key schedule [if any]
+ stp q0, q1, [sp], #32
+ cmp sp, x14
+ bne .Lctr_enc_bzero
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp fp, lr, [sp], #80
+ ret
+
+.Lctr_enc_short:
+ stp fp, lr, [sp, #-96]!
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ str x23, [sp, #48]
+
+ mov x19, x0 // copy arguments
+ mov x20, x1
+ mov x21, x2
+ mov x22, x3
+ ldr w23, [x4, #12] // load counter .LSW
+ ldr q1, [x4] // load whole counter value
+#ifdef __ARMEL__
+ rev w23, w23
+#endif
+ str q1, [sp, #80] // copy counter value
+
+.Lctr_enc_short_loop:
+ add x0, sp, #80 // input counter value
+ add x1, sp, #64 // output on the stack
+ mov x2, x22 // key
+
+ bl AES_encrypt
+
+ ldr q0, [x19], #16 // load input
+ ldr q1, [sp, #64] // load encrypted counter
+ add x23, x23, #1
+#ifdef __ARMEL__
+ rev w0, w23
+ str w0, [sp, #80+12] // next counter value
+#else
+ str w23, [sp, #80+12] // next counter value
+#endif
+ eor v0.16b, v0.16b, v1.16b
+ str q0, [x20], #16 // store output
+ subs x21, x21, #1
+ bne .Lctr_enc_short_loop
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+ stp q0, q1, [sp, #64]
+
+ ldr x23, [sp, #48]
+ ldp x21, x22, [sp, #32]
+ ldp x19, x20, [sp, #16]
+ ldp fp, lr, [sp], #96
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+// On entry:
+// x0 -> input plaintext
+// x1 -> output ciphertext
+// x2 -> length of text in bytes (must be at least 16)
+// x3 -> key1 (used to encrypt the XORed plaintext blocks)
+// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
+// x5 -> 16-byte initial vector (typically, sector number)
+// On exit:
+// Output ciphertext filled in
+// No output registers, usual AAPCS64 register preservation
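
The flow below is standard XTS: the initial tweak is key2's encryption of the IV, each block is whitened with its tweak before and after the key1 encryption (C[i] = E_K1(P[i] ^ T[i]) ^ T[i]), and the tweak is multiplied by x in GF(2^128) between blocks -- the sshr/cmtst/add/eor sequences against .Lxts_magic below compute that doubling for eight tweaks per iteration. A hedged scalar sketch of one doubling, assuming the 16 tweak bytes are held as two little-endian 64-bit limbs with the low half in t[0]:

    #include <stdint.h>

    /* Multiply the 128-bit XTS tweak by x in GF(2^128), feeding the carry
     * out of the top bit back through the polynomial constant 0x87. */
    void xts_double(uint64_t t[2])
    {
        uint64_t fb = (t[1] >> 63) ? 0x87 : 0;  /* carry out of the top bit? */

        t[1] = (t[1] << 1) | (t[0] >> 63);
        t[0] = (t[0] << 1) ^ fb;
    }
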
+bsaes_xts_encrypt:
+ // Stack layout:
+ // sp ->
+ // nrounds*128-96 bytes: key schedule
+ // x19 ->
+ // 16 bytes: frame record
+ // 4*16 bytes: tweak storage across _bsaes_encrypt8
+ // 6*8 bytes: storage for 5 callee-saved general-purpose registers
+ // 8*8 bytes: storage for 8 callee-saved SIMD registers
+ stp fp, lr, [sp, #-192]!
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ str x23, [sp, #112]
+ stp d8, d9, [sp, #128]
+ stp d10, d11, [sp, #144]
+ stp d12, d13, [sp, #160]
+ stp d14, d15, [sp, #176]
+
+ mov x19, sp
+ mov x20, x0
+ mov x21, x1
+ mov x22, x2
+ mov x23, x3
+
+ // generate initial tweak
+ sub sp, sp, #16
+ mov x0, x5 // iv[]
+ mov x1, sp
+ mov x2, x4 // key2
+ bl AES_encrypt
+ ldr q11, [sp], #16
+
+ ldr w1, [x23, #240] // get # of rounds
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x23 // pass key
+ mov x10, x1 // pass # of rounds
+ mov sp, x17
+ bl _bsaes_key_convert
+ eor v15.16b, v15.16b, v7.16b // fix up last round key
+ str q15, [x17] // save last round key
+
+ subs x22, x22, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ ldr q8, .Lxts_magic
+ mov x10, x1 // pass rounds
+ add x2, x19, #16
+ ldr q0, [x20], #16
+ sshr v1.2d, v11.2d, #63
+ mov x9, sp // pass key schedule
+ ldr q6, .Lxts_magic+16
+ add v2.2d, v11.2d, v11.2d
+ cmtst v3.2d, v11.2d, v6.2d
+ and v1.16b, v1.16b, v8.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ ldr q4, [x20], #16
+ eor v12.16b, v2.16b, v1.16b
+ eor v1.16b, v4.16b, v12.16b
+ eor v0.16b, v0.16b, v11.16b
+ cmtst v2.2d, v12.2d, v6.2d
+ add v4.2d, v12.2d, v12.2d
+ add x0, x19, #16
+ ext v3.16b, v3.16b, v3.16b, #8
+ and v2.16b, v2.16b, v8.16b
+ eor v13.16b, v4.16b, v3.16b
+ ldr q3, [x20], #16
+ ext v4.16b, v2.16b, v2.16b, #8
+ eor v2.16b, v3.16b, v13.16b
+ ldr q3, [x20], #16
+ add v5.2d, v13.2d, v13.2d
+ cmtst v7.2d, v13.2d, v6.2d
+ and v7.16b, v7.16b, v8.16b
+ ldr q9, [x20], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ ldr q10, [x20], #16
+ eor v14.16b, v5.16b, v4.16b
+ ldr q16, [x20], #16
+ add v4.2d, v14.2d, v14.2d
+ eor v3.16b, v3.16b, v14.16b
+ eor v15.16b, v4.16b, v7.16b
+ add v5.2d, v15.2d, v15.2d
+ ldr q7, [x20], #16
+ cmtst v4.2d, v14.2d, v6.2d
+ and v17.16b, v4.16b, v8.16b
+ cmtst v18.2d, v15.2d, v6.2d
+ eor v4.16b, v9.16b, v15.16b
+ ext v9.16b, v17.16b, v17.16b, #8
+ eor v9.16b, v5.16b, v9.16b
+ add v17.2d, v9.2d, v9.2d
+ and v18.16b, v18.16b, v8.16b
+ eor v5.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ ext v10.16b, v18.16b, v18.16b, #8
+ cmtst v9.2d, v9.2d, v6.2d
+ and v9.16b, v9.16b, v8.16b
+ eor v10.16b, v17.16b, v10.16b
+ cmtst v17.2d, v10.2d, v6.2d
+ eor v6.16b, v16.16b, v10.16b
+ str q10, [x2], #16
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v10.2d, v10.2d, v10.2d
+ eor v9.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ eor v7.16b, v7.16b, v9.16b
+ add v9.2d, v9.2d, v9.2d
+ and v8.16b, v17.16b, v8.16b
+ ext v8.16b, v8.16b, v8.16b, #8
+ eor v8.16b, v9.16b, v8.16b
+ str q8, [x2] // next round tweak
+
+ bl _bsaes_encrypt8
+
+ ldr q8, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q9, [x0], #16
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ ldr q10, [x0], #16
+ eor v3.16b, v3.16b, v15.16b
+ subs x22, x22, #0x80
+ str q0, [x21], #16
+ ldr q11, [x0] // next round tweak
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v8.16b
+ eor v1.16b, v2.16b, v9.16b
+ str q4, [x21], #16
+ eor v2.16b, v5.16b, v10.16b
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q2, [x21], #16
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds x22, x22, #0x70
+ bmi .Lxts_enc_done
+
+ ldr q8, .Lxts_magic
+ sshr v1.2d, v11.2d, #63
+ add v2.2d, v11.2d, v11.2d
+ ldr q9, .Lxts_magic+16
+ subs x22, x22, #0x10
+ ldr q0, [x20], #16
+ and v1.16b, v1.16b, v8.16b
+ cmtst v3.2d, v11.2d, v9.2d
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ eor v12.16b, v2.16b, v1.16b
+ ext v1.16b, v3.16b, v3.16b, #8
+ add v2.2d, v12.2d, v12.2d
+ cmtst v3.2d, v12.2d, v9.2d
+ eor v13.16b, v2.16b, v1.16b
+ and v22.16b, v3.16b, v8.16b
+ bmi .Lxts_enc_1
+
+ ext v2.16b, v22.16b, v22.16b, #8
+ add v3.2d, v13.2d, v13.2d
+ ldr q1, [x20], #16
+ cmtst v4.2d, v13.2d, v9.2d
+ subs x22, x22, #0x10
+ eor v14.16b, v3.16b, v2.16b
+ and v23.16b, v4.16b, v8.16b
+ bmi .Lxts_enc_2
+
+ ext v3.16b, v23.16b, v23.16b, #8
+ add v4.2d, v14.2d, v14.2d
+ ldr q2, [x20], #16
+ cmtst v5.2d, v14.2d, v9.2d
+ eor v0.16b, v0.16b, v11.16b
+ subs x22, x22, #0x10
+ eor v15.16b, v4.16b, v3.16b
+ and v24.16b, v5.16b, v8.16b
+ bmi .Lxts_enc_3
+
+ ext v4.16b, v24.16b, v24.16b, #8
+ add v5.2d, v15.2d, v15.2d
+ ldr q3, [x20], #16
+ cmtst v6.2d, v15.2d, v9.2d
+ eor v1.16b, v1.16b, v12.16b
+ subs x22, x22, #0x10
+ eor v16.16b, v5.16b, v4.16b
+ and v25.16b, v6.16b, v8.16b
+ bmi .Lxts_enc_4
+
+ ext v5.16b, v25.16b, v25.16b, #8
+ add v6.2d, v16.2d, v16.2d
+ add x0, x19, #16
+ cmtst v7.2d, v16.2d, v9.2d
+ ldr q4, [x20], #16
+ eor v2.16b, v2.16b, v13.16b
+ str q16, [x0], #16
+ subs x22, x22, #0x10
+ eor v17.16b, v6.16b, v5.16b
+ and v26.16b, v7.16b, v8.16b
+ bmi .Lxts_enc_5
+
+ ext v7.16b, v26.16b, v26.16b, #8
+ add v18.2d, v17.2d, v17.2d
+ ldr q5, [x20], #16
+ eor v3.16b, v3.16b, v14.16b
+ str q17, [x0], #16
+ subs x22, x22, #0x10
+ eor v18.16b, v18.16b, v7.16b
+ bmi .Lxts_enc_6
+
+ ldr q6, [x20], #16
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ str q18, [x0] // next round tweak
+ mov x9, sp // pass key schedule
+ mov x10, x1
+ add x0, x19, #16
+ sub x22, x22, #0x10
+ eor v6.16b, v6.16b, v17.16b
+
+ bl _bsaes_encrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q17, [x0], #16
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ ldr q11, [x0] // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ eor v1.16b, v2.16b, v17.16b
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_6:
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ ldr q11, [x0] // next round tweak
+ eor v3.16b, v3.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q0, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_5:
+ eor v3.16b, v3.16b, v14.16b
+ eor v4.16b, v4.16b, v15.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q11, [x0] // next round tweak
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ str q6, [x21], #16
+ str q3, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_4:
+ eor v2.16b, v2.16b, v13.16b
+ eor v3.16b, v3.16b, v14.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ mov v11.16b, v15.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ str q6, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_3:
+ eor v1.16b, v1.16b, v12.16b
+ eor v2.16b, v2.16b, v13.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v13.16b
+ mov v11.16b, v14.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q4, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_2:
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_encrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov v11.16b, v13.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_enc_done
+
+.align 4
+.Lxts_enc_1:
+ eor v0.16b, v0.16b, v11.16b
+ sub x0, sp, #16
+ sub x1, sp, #16
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+ str q0, [sp, #-16]!
+
+ bl AES_encrypt
+
+ ldr q0, [sp], #16
+ trn1 v13.2d, v11.2d, v13.2d
+ trn1 v11.2d, v12.2d, v14.2d // next round tweak
+ eor v0.16b, v0.16b, v13.16b
+ str q0, [x21], #16
+
+.Lxts_enc_done:
+ adds x22, x22, #0x10
+ beq .Lxts_enc_ret
+
+ sub x6, x21, #0x10
+ // Penultimate plaintext block produces final ciphertext part-block
+ // plus remaining part of final plaintext block. Move ciphertext part
+ // to final position and re-use penultimate ciphertext block buffer to
+ // construct final plaintext block
+.Lxts_enc_steal:
+ ldrb w0, [x20], #1
+ ldrb w1, [x21, #-0x10]
+ strb w0, [x21, #-0x10]
+ strb w1, [x21], #1
+
+ subs x22, x22, #1
+ bhi .Lxts_enc_steal
+
+ // Finally encrypt the penultimate ciphertext block using the
+ // last tweak
+ ldr q0, [x6]
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov x21, x6
+ mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
+
+ bl AES_encrypt
+
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [x21]
+
+.Lxts_enc_ret:
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lxts_enc_bzero: // wipe key schedule
+ stp q0, q1, [sp], #32
+ cmp sp, x19
+ bne .Lxts_enc_bzero
+
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldr x23, [sp, #112]
+ ldp d8, d9, [sp, #128]
+ ldp d10, d11, [sp, #144]
+ ldp d12, d13, [sp, #160]
+ ldp d14, d15, [sp, #176]
+ ldp fp, lr, [sp], #192
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+// The assembler doesn't seem capable of de-duplicating these when expressed
+// using `ldr qd,=` syntax, so assign a symbolic address
+.align 5
+.Lxts_magic:
+.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+// On entry:
+// x0 -> input ciphertext
+// x1 -> output plaintext
+// x2 -> length of text in bytes (must be at least 16)
+// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
+// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
+// x5 -> 16-byte initial vector (typically, sector number)
+// On exit:
+// Output plaintext filled in
+// No output registers, usual AAPCS64 register preservation
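
Decryption mirrors the encryption path; the structural difference is ciphertext stealing when the length is not a multiple of 16: the final full ciphertext block must be decrypted under the tweak *after* the last regular one (the "extra tweak" computed at .Lxts_dec_done below), the short tail is swapped in, and the reassembled block is then decrypted under the held-back tweak. A compressed sketch of that tail, using the hypothetical aes_decrypt_block() and the xts_double() sketched earlier:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void aes_decrypt_block(const uint8_t in[16], uint8_t out[16],
                           const void *key1);
    void xts_double(uint64_t t[2]);

    static void xts_dec_block(uint8_t out[16], const uint8_t in[16],
                              const uint64_t twk[2], const void *key1)
    {
        const uint8_t *t = (const uint8_t *)twk;
        uint8_t x[16], y[16];

        for (int i = 0; i < 16; i++)
            x[i] = in[i] ^ t[i];
        aes_decrypt_block(x, y, key1);
        for (int i = 0; i < 16; i++)
            out[i] = y[i] ^ t[i];
    }

    /* Tail for length 16*m + b bytes, 0 < b < 16: "cprev" is the last full
     * ciphertext block, "ctail" the b stolen bytes, "twk" the tweak T[m-1]
     * held back for it, and "out" the destination of plaintext block m-1
     * (the final partial plaintext lands at out + 16). */
    static void xts_dec_tail(uint8_t *out, const uint8_t cprev[16],
                             const uint8_t *ctail, size_t b,
                             uint64_t twk[2], const void *key1)
    {
        uint64_t next[2] = { twk[0], twk[1] };
        uint8_t pp[16], cc[16];

        xts_double(next);                       /* T[m]                      */
        xts_dec_block(pp, cprev, next, key1);   /* PP = D(C[m-1], T[m])      */
        memcpy(out + 16, pp, b);                /* final partial plaintext   */
        memcpy(cc, ctail, b);                   /* steal the b tail bytes... */
        memcpy(cc + b, pp + b, 16 - b);         /* ...pad with PP's tail     */
        xts_dec_block(out, cc, twk, key1);      /* P[m-1] = D(C', T[m-1])    */
    }
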
+bsaes_xts_decrypt:
+ // Stack layout:
+ // sp ->
+ // nrounds*128-96 bytes: key schedule
+ // x19 ->
+ // 16 bytes: frame record
+ // 4*16 bytes: tweak storage across _bsaes_decrypt8
+ // 6*8 bytes: storage for 5 callee-saved general-purpose registers
+ // 8*8 bytes: storage for 8 callee-saved SIMD registers
+ stp fp, lr, [sp, #-192]!
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ str x23, [sp, #112]
+ stp d8, d9, [sp, #128]
+ stp d10, d11, [sp, #144]
+ stp d12, d13, [sp, #160]
+ stp d14, d15, [sp, #176]
+
+ mov x19, sp
+ mov x20, x0
+ mov x21, x1
+ mov x22, x2
+ mov x23, x3
+
+ // generate initial tweak
+ sub sp, sp, #16
+ mov x0, x5 // iv[]
+ mov x1, sp
+ mov x2, x4 // key2
+ bl AES_encrypt
+ ldr q11, [sp], #16
+
+ ldr w1, [x23, #240] // get # of rounds
+ // allocate the key schedule on the stack
+ add x17, sp, #96
+ sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
+
+ // populate the key schedule
+ mov x9, x23 // pass key
+ mov x10, x1 // pass # of rounds
+ mov sp, x17
+ bl _bsaes_key_convert
+ ldr q6, [sp]
+ str q15, [x17] // save last round key
+ eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
+ str q6, [sp]
+
+ sub x30, x22, #0x10
+ tst x22, #0xf // if not multiple of 16
+ csel x22, x30, x22, ne // subtract another 16 bytes
+ subs x22, x22, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ ldr q8, .Lxts_magic
+ mov x10, x1 // pass rounds
+ add x2, x19, #16
+ ldr q0, [x20], #16
+ sshr v1.2d, v11.2d, #63
+ mov x9, sp // pass key schedule
+ ldr q6, .Lxts_magic+16
+ add v2.2d, v11.2d, v11.2d
+ cmtst v3.2d, v11.2d, v6.2d
+ and v1.16b, v1.16b, v8.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ ldr q4, [x20], #16
+ eor v12.16b, v2.16b, v1.16b
+ eor v1.16b, v4.16b, v12.16b
+ eor v0.16b, v0.16b, v11.16b
+ cmtst v2.2d, v12.2d, v6.2d
+ add v4.2d, v12.2d, v12.2d
+ add x0, x19, #16
+ ext v3.16b, v3.16b, v3.16b, #8
+ and v2.16b, v2.16b, v8.16b
+ eor v13.16b, v4.16b, v3.16b
+ ldr q3, [x20], #16
+ ext v4.16b, v2.16b, v2.16b, #8
+ eor v2.16b, v3.16b, v13.16b
+ ldr q3, [x20], #16
+ add v5.2d, v13.2d, v13.2d
+ cmtst v7.2d, v13.2d, v6.2d
+ and v7.16b, v7.16b, v8.16b
+ ldr q9, [x20], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ ldr q10, [x20], #16
+ eor v14.16b, v5.16b, v4.16b
+ ldr q16, [x20], #16
+ add v4.2d, v14.2d, v14.2d
+ eor v3.16b, v3.16b, v14.16b
+ eor v15.16b, v4.16b, v7.16b
+ add v5.2d, v15.2d, v15.2d
+ ldr q7, [x20], #16
+ cmtst v4.2d, v14.2d, v6.2d
+ and v17.16b, v4.16b, v8.16b
+ cmtst v18.2d, v15.2d, v6.2d
+ eor v4.16b, v9.16b, v15.16b
+ ext v9.16b, v17.16b, v17.16b, #8
+ eor v9.16b, v5.16b, v9.16b
+ add v17.2d, v9.2d, v9.2d
+ and v18.16b, v18.16b, v8.16b
+ eor v5.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ ext v10.16b, v18.16b, v18.16b, #8
+ cmtst v9.2d, v9.2d, v6.2d
+ and v9.16b, v9.16b, v8.16b
+ eor v10.16b, v17.16b, v10.16b
+ cmtst v17.2d, v10.2d, v6.2d
+ eor v6.16b, v16.16b, v10.16b
+ str q10, [x2], #16
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v10.2d, v10.2d, v10.2d
+ eor v9.16b, v10.16b, v9.16b
+ str q9, [x2], #16
+ eor v7.16b, v7.16b, v9.16b
+ add v9.2d, v9.2d, v9.2d
+ and v8.16b, v17.16b, v8.16b
+ ext v8.16b, v8.16b, v8.16b, #8
+ eor v8.16b, v9.16b, v8.16b
+ str q8, [x2] // next round tweak
+
+ bl _bsaes_decrypt8
+
+ eor v6.16b, v6.16b, v13.16b
+ eor v0.16b, v0.16b, v11.16b
+ ldr q8, [x0], #16
+ eor v7.16b, v7.16b, v8.16b
+ str q0, [x21], #16
+ eor v0.16b, v1.16b, v12.16b
+ ldr q1, [x0], #16
+ eor v1.16b, v3.16b, v1.16b
+ subs x22, x22, #0x80
+ eor v2.16b, v2.16b, v15.16b
+ eor v3.16b, v4.16b, v14.16b
+ ldr q4, [x0], #16
+ str q0, [x21], #16
+ ldr q11, [x0] // next round tweak
+ eor v0.16b, v5.16b, v4.16b
+ str q6, [x21], #16
+ str q3, [x21], #16
+ str q2, [x21], #16
+ str q7, [x21], #16
+ str q1, [x21], #16
+ str q0, [x21], #16
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds x22, x22, #0x70
+ bmi .Lxts_dec_done
+
+ ldr q8, .Lxts_magic
+ sshr v1.2d, v11.2d, #63
+ add v2.2d, v11.2d, v11.2d
+ ldr q9, .Lxts_magic+16
+ subs x22, x22, #0x10
+ ldr q0, [x20], #16
+ and v1.16b, v1.16b, v8.16b
+ cmtst v3.2d, v11.2d, v9.2d
+ ext v1.16b, v1.16b, v1.16b, #8
+ and v3.16b, v3.16b, v8.16b
+ eor v12.16b, v2.16b, v1.16b
+ ext v1.16b, v3.16b, v3.16b, #8
+ add v2.2d, v12.2d, v12.2d
+ cmtst v3.2d, v12.2d, v9.2d
+ eor v13.16b, v2.16b, v1.16b
+ and v22.16b, v3.16b, v8.16b
+ bmi .Lxts_dec_1
+
+ ext v2.16b, v22.16b, v22.16b, #8
+ add v3.2d, v13.2d, v13.2d
+ ldr q1, [x20], #16
+ cmtst v4.2d, v13.2d, v9.2d
+ subs x22, x22, #0x10
+ eor v14.16b, v3.16b, v2.16b
+ and v23.16b, v4.16b, v8.16b
+ bmi .Lxts_dec_2
+
+ ext v3.16b, v23.16b, v23.16b, #8
+ add v4.2d, v14.2d, v14.2d
+ ldr q2, [x20], #16
+ cmtst v5.2d, v14.2d, v9.2d
+ eor v0.16b, v0.16b, v11.16b
+ subs x22, x22, #0x10
+ eor v15.16b, v4.16b, v3.16b
+ and v24.16b, v5.16b, v8.16b
+ bmi .Lxts_dec_3
+
+ ext v4.16b, v24.16b, v24.16b, #8
+ add v5.2d, v15.2d, v15.2d
+ ldr q3, [x20], #16
+ cmtst v6.2d, v15.2d, v9.2d
+ eor v1.16b, v1.16b, v12.16b
+ subs x22, x22, #0x10
+ eor v16.16b, v5.16b, v4.16b
+ and v25.16b, v6.16b, v8.16b
+ bmi .Lxts_dec_4
+
+ ext v5.16b, v25.16b, v25.16b, #8
+ add v6.2d, v16.2d, v16.2d
+ add x0, x19, #16
+ cmtst v7.2d, v16.2d, v9.2d
+ ldr q4, [x20], #16
+ eor v2.16b, v2.16b, v13.16b
+ str q16, [x0], #16
+ subs x22, x22, #0x10
+ eor v17.16b, v6.16b, v5.16b
+ and v26.16b, v7.16b, v8.16b
+ bmi .Lxts_dec_5
+
+ ext v7.16b, v26.16b, v26.16b, #8
+ add v18.2d, v17.2d, v17.2d
+ ldr q5, [x20], #16
+ eor v3.16b, v3.16b, v14.16b
+ str q17, [x0], #16
+ subs x22, x22, #0x10
+ eor v18.16b, v18.16b, v7.16b
+ bmi .Lxts_dec_6
+
+ ldr q6, [x20], #16
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ str q18, [x0] // next round tweak
+ mov x9, sp // pass key schedule
+ mov x10, x1
+ add x0, x19, #16
+ sub x22, x22, #0x10
+ eor v6.16b, v6.16b, v17.16b
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q17, [x0], #16
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ eor v2.16b, v2.16b, v15.16b
+ ldr q11, [x0] // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ eor v1.16b, v3.16b, v17.16b
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_6:
+ eor v4.16b, v4.16b, v15.16b
+ eor v5.16b, v5.16b, v16.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ ldr q16, [x0], #16
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ ldr q11, [x0] // next round tweak
+ eor v2.16b, v2.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ eor v0.16b, v7.16b, v16.16b
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ str q0, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_5:
+ eor v3.16b, v3.16b, v14.16b
+ eor v4.16b, v4.16b, v15.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ ldr q11, [x0] // next round tweak
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ eor v2.16b, v2.16b, v15.16b
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ str q4, [x21], #16
+ str q2, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_4:
+ eor v2.16b, v2.16b, v13.16b
+ eor v3.16b, v3.16b, v14.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v4.16b, v4.16b, v14.16b
+ mov v11.16b, v15.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ str q4, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_3:
+ eor v1.16b, v1.16b, v12.16b
+ eor v2.16b, v2.16b, v13.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ mov v11.16b, v14.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ str q6, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_2:
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov x9, sp // pass key schedule
+ mov x10, x1 // pass rounds
+ add x0, x19, #16
+
+ bl _bsaes_decrypt8
+
+ eor v0.16b, v0.16b, v11.16b
+ eor v1.16b, v1.16b, v12.16b
+ mov v11.16b, v13.16b // next round tweak
+ str q0, [x21], #16
+ str q1, [x21], #16
+ b .Lxts_dec_done
+
+.align 4
+.Lxts_dec_1:
+ eor v0.16b, v0.16b, v11.16b
+ sub x0, sp, #16
+ sub x1, sp, #16
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+ str q0, [sp, #-16]!
+
+ bl AES_decrypt
+
+ ldr q0, [sp], #16
+ trn1 v13.2d, v11.2d, v13.2d
+ trn1 v11.2d, v12.2d, v14.2d // next round tweak
+ eor v0.16b, v0.16b, v13.16b
+ str q0, [x21], #16
+
+.Lxts_dec_done:
+ adds x22, x22, #0x10
+ beq .Lxts_dec_ret
+
+ // calculate one round of extra tweak for the stolen ciphertext
+ ldr q8, .Lxts_magic
+ sshr v6.2d, v11.2d, #63
+ and v6.16b, v6.16b, v8.16b
+ add v12.2d, v11.2d, v11.2d
+ ext v6.16b, v6.16b, v6.16b, #8
+ eor v12.16b, v12.16b, v6.16b
+
+ // perform the final decryption with the last tweak value
+ ldr q0, [x20], #16
+ eor v0.16b, v0.16b, v12.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
+ mov v14.d[0], v12.d[1]
+
+ bl AES_decrypt
+
+ trn1 v12.2d, v12.2d, v14.2d
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v12.16b
+ str q0, [x21]
+
+ mov x6, x21
+ // Penultimate ciphertext block produces final plaintext part-block
+ // plus remaining part of final ciphertext block. Move plaintext part
+ // to final position and re-use penultimate plaintext block buffer to
+ // construct final ciphertext block
+.Lxts_dec_steal:
+ ldrb w1, [x21]
+ ldrb w0, [x20], #1
+ strb w1, [x21, #0x10]
+ strb w0, [x21], #1
+
+ subs x22, x22, #1
+ bhi .Lxts_dec_steal
+
+ // Finally decrypt the penultimate plaintext block using the
+ // penultimate tweak
+ ldr q0, [x6]
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [sp, #-16]!
+ mov x0, sp
+ mov x1, sp
+ mov x2, x23
+ mov x21, x6
+
+ bl AES_decrypt
+
+ trn1 v11.2d, v11.2d, v13.2d
+ ldr q0, [sp], #16
+ eor v0.16b, v0.16b, v11.16b
+ str q0, [x21]
+
+.Lxts_dec_ret:
+
+ movi v0.16b, #0
+ movi v1.16b, #0
+.Lxts_dec_bzero: // wipe key schedule
+ stp q0, q1, [sp], #32
+ cmp sp, x19
+ bne .Lxts_dec_bzero
+
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldr x23, [sp, #112]
+ ldp d8, d9, [sp, #128]
+ ldp d10, d11, [sp, #144]
+ ldp d12, d13, [sp, #160]
+ ldp d14, d15, [sp, #176]
+ ldp fp, lr, [sp], #192
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index 66bff1ae73..b17f7e5c1f 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -30,8 +30,8 @@ IF[{- !$disabled{asm} -}]
$AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S
$AESDEF_armv4=AES_ASM BSAES_ASM
- $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S
- $AESDEF_aarch64=VPAES_ASM
+ $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S
+ $AESDEF_aarch64=BSAES_ASM VPAES_ASM
$AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
$AESDEF_parisc11=AES_ASM
@@ -73,6 +73,7 @@ DEFINE[../../providers/libfips.a]=$AESDEF
DEFINE[../../providers/libdefault.a]=$AESDEF
GENERATE[aes-ia64.s]=asm/aes-ia64.S
+GENERATE[bsaes-armv8.S]=asm/bsaes-armv8.S
GENERATE[aes-586.s]=asm/aes-586.pl
DEPEND[aes-586.s]=../perlasm/x86asm.pl