[openssl] master update
Matt Caswell
matt at openssl.org
Thu May 20 07:51:55 UTC 2021
The branch master has been updated
via e3884ec5c37334e585e9208ce69d7e5b3cad4624 (commit)
from b7140b0604bdfaa034452d97648a9c23a97568e4 (commit)
- Log -----------------------------------------------------------------
commit e3884ec5c37334e585e9208ce69d7e5b3cad4624
Author: Pauli <pauli at openssl.org>
Date: Thu May 20 13:51:59 2021 +1000
Revert "ARM assembly pack: translate bit-sliced AES implementation to AArch64"
This reverts commit da51566b256e0c0536d5b986e676863b0526bf5e.
Fixes #15321
Reviewed-by: Tim Hudson <tjh at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15364)
-----------------------------------------------------------------------
Summary of changes:
crypto/aes/asm/bsaes-armv8.S | 2338 ------------------------------------------
crypto/aes/build.info | 5 +-
2 files changed, 2 insertions(+), 2341 deletions(-)
delete mode 100644 crypto/aes/asm/bsaes-armv8.S
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
deleted file mode 100644
index 9bd02d0c8a..0000000000
--- a/crypto/aes/asm/bsaes-armv8.S
+++ /dev/null
@@ -1,2338 +0,0 @@
-// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// ====================================================================
-// Written by Ben Avison <bavison at riscosopen.org> for the OpenSSL
-// project. Rights for redistribution and usage in source and binary
-// forms are granted according to the OpenSSL license.
-// ====================================================================
-//
-// This implementation is a translation of bsaes-armv7 for AArch64.
-// No attempt has been made to carry across the build switches for
-// kernel targets, since the Linux kernel crypto support has moved on
-// from when it was based on OpenSSL.
-
-// A lot of hand-scheduling has been performed. Consequently, this code
-// doesn't factor out neatly into macros in the same way that the
-// AArch32 version did, so there is little to be gained by wrapping it
-// up in Perl; it is instead presented as pure assembly.
-
-
-#include "crypto/arm_arch.h"
-
-.text
-
-.type _bsaes_decrypt8,%function
-.align 4
-// Internal helper: bit-sliced AES decryption of the eight blocks held in v0-v7. On entry:
-// x9 -> key (previously expanded using _bsaes_key_convert)
-// x10 = number of rounds
-// v0-v7 input data
-// On exit:
-// x9-x11 corrupted
-// other general-purpose registers preserved
-// v0-v7 output data (the CBC caller in this file consumes them in the order v0, v1, v6, v4, v2, v7, v3, v5)
-// v11-v15 preserved
-// other SIMD registers corrupted
-_bsaes_decrypt8:
- ldr q8, [x9], #16 // v8 = round 0 key; x9 now points at the first bit-sliced round key
- adr x11, .LM0ISR
- movi v9.16b, #0x55 // v9/v16/v17 = masks for the shift-by-1/2/4 swap stages below
- ldr q10, [x11], #16 // v10 = .LM0ISR permutation; x11 -> .LISR (next table entry)
- movi v16.16b, #0x33
- movi v17.16b, #0x0f
- sub x10, x10, #1
- eor v0.16b, v0.16b, v8.16b // XOR the round 0 key into all eight blocks (interleaved with the tbl permutes)
- eor v1.16b, v1.16b, v8.16b
- eor v2.16b, v2.16b, v8.16b
- eor v4.16b, v4.16b, v8.16b
- eor v3.16b, v3.16b, v8.16b
- eor v5.16b, v5.16b, v8.16b
- tbl v0.16b, {v0.16b}, v10.16b // permute the bytes of each block by .LM0ISR
- tbl v1.16b, {v1.16b}, v10.16b
- tbl v2.16b, {v2.16b}, v10.16b
- tbl v4.16b, {v4.16b}, v10.16b
- eor v6.16b, v6.16b, v8.16b
- eor v7.16b, v7.16b, v8.16b
- tbl v3.16b, {v3.16b}, v10.16b
- tbl v5.16b, {v5.16b}, v10.16b
- tbl v6.16b, {v6.16b}, v10.16b
- ushr v8.2d, v0.2d, #1 // swap stage 1 starts: shift 1, mask 0x55 (v9), register pairs (0,1) (4,5) (2,3) (6,7)
- tbl v7.16b, {v7.16b}, v10.16b
- ushr v10.2d, v4.2d, #1
- ushr v18.2d, v2.2d, #1
- eor v8.16b, v8.16b, v1.16b
- ushr v19.2d, v6.2d, #1
- eor v10.16b, v10.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- and v8.16b, v8.16b, v9.16b
- eor v19.16b, v19.16b, v7.16b
- and v10.16b, v10.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- eor v1.16b, v1.16b, v8.16b
- shl v8.2d, v8.2d, #1
- and v9.16b, v19.16b, v9.16b // last use of the 0x55 mask: v9 is overwritten here
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #1
- eor v3.16b, v3.16b, v18.16b
- shl v18.2d, v18.2d, #1
- eor v0.16b, v0.16b, v8.16b
- shl v8.2d, v9.2d, #1
- eor v7.16b, v7.16b, v9.16b
- eor v4.16b, v4.16b, v10.16b
- eor v2.16b, v2.16b, v18.16b
- ushr v9.2d, v1.2d, #2 // swap stage 2 starts (interleaved with the end of stage 1): shift 2, mask 0x33 (v16)
- eor v6.16b, v6.16b, v8.16b
- ushr v8.2d, v0.2d, #2
- ushr v10.2d, v5.2d, #2
- ushr v18.2d, v4.2d, #2
- eor v9.16b, v9.16b, v3.16b
- eor v8.16b, v8.16b, v2.16b
- eor v10.16b, v10.16b, v7.16b
- eor v18.16b, v18.16b, v6.16b
- and v9.16b, v9.16b, v16.16b
- and v8.16b, v8.16b, v16.16b
- and v10.16b, v10.16b, v16.16b
- and v16.16b, v18.16b, v16.16b // last use of the 0x33 mask: v16 is overwritten here
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v2.16b, v2.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v7.16b, v7.16b, v10.16b
- shl v10.2d, v10.2d, #2
- eor v6.16b, v6.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v1.16b, v1.16b, v9.16b
- eor v0.16b, v0.16b, v8.16b
- eor v5.16b, v5.16b, v10.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v3.2d, #4 // swap stage 3: shift 4, mask 0x0f (v17)
- ushr v9.2d, v2.2d, #4
- ushr v10.2d, v1.2d, #4
- ushr v16.2d, v0.2d, #4
- eor v8.16b, v8.16b, v7.16b
- eor v9.16b, v9.16b, v6.16b
- eor v10.16b, v10.16b, v5.16b
- eor v16.16b, v16.16b, v4.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v10.16b, v10.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #4
- eor v4.16b, v4.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v16.16b
- b .Ldec_sbox // the first pass skips the round-key load/permute at .Ldec_loop
-.align 4
-.Ldec_loop:
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 // fetch one 128-byte bit-sliced round key (64+32+16+16 bytes) and XOR it into v0-v7
- ldp q8, q9, [x9], #32
- eor v0.16b, v16.16b, v0.16b
- ldr q10, [x9], #16
- eor v1.16b, v17.16b, v1.16b
- ldr q16, [x9], #16
- eor v2.16b, v18.16b, v2.16b
- eor v3.16b, v19.16b, v3.16b
- eor v4.16b, v8.16b, v4.16b
- eor v5.16b, v9.16b, v5.16b
- eor v6.16b, v10.16b, v6.16b
- eor v7.16b, v16.16b, v7.16b
- tbl v0.16b, {v0.16b}, v28.16b // byte permutation held in v28 (.LISR, or .LISRM0 on the final pass)
- tbl v1.16b, {v1.16b}, v28.16b
- tbl v2.16b, {v2.16b}, v28.16b
- tbl v3.16b, {v3.16b}, v28.16b
- tbl v4.16b, {v4.16b}, v28.16b
- tbl v5.16b, {v5.16b}, v28.16b
- tbl v6.16b, {v6.16b}, v28.16b
- tbl v7.16b, {v7.16b}, v28.16b
-.Ldec_sbox:
- eor v1.16b, v1.16b, v4.16b
- eor v3.16b, v3.16b, v4.16b
- subs x10, x10, #1 // round counter; the flags set here are consumed by bcc and bne below (only SIMD ops and loads intervene)
- eor v4.16b, v4.16b, v7.16b
- eor v2.16b, v2.16b, v7.16b
- eor v1.16b, v1.16b, v6.16b
- eor v6.16b, v6.16b, v4.16b
- eor v2.16b, v2.16b, v5.16b
- eor v0.16b, v0.16b, v1.16b
- eor v7.16b, v7.16b, v6.16b
- eor v8.16b, v6.16b, v2.16b
- and v9.16b, v4.16b, v6.16b
- eor v10.16b, v2.16b, v6.16b
- eor v3.16b, v3.16b, v0.16b
- eor v5.16b, v5.16b, v0.16b
- eor v16.16b, v7.16b, v4.16b
- eor v17.16b, v4.16b, v0.16b
- and v18.16b, v0.16b, v2.16b
- eor v19.16b, v7.16b, v4.16b
- eor v1.16b, v1.16b, v3.16b
- eor v20.16b, v3.16b, v0.16b
- eor v21.16b, v5.16b, v2.16b
- eor v22.16b, v3.16b, v7.16b
- and v8.16b, v17.16b, v8.16b
- orr v17.16b, v3.16b, v5.16b
- eor v23.16b, v1.16b, v6.16b
- eor v24.16b, v20.16b, v16.16b
- eor v25.16b, v1.16b, v5.16b
- orr v26.16b, v20.16b, v21.16b
- and v20.16b, v20.16b, v21.16b
- and v27.16b, v7.16b, v1.16b
- eor v21.16b, v21.16b, v23.16b
- orr v28.16b, v16.16b, v23.16b // v28 is used as scratch here; the permutation is reloaded into it before the next .Ldec_loop pass
- orr v29.16b, v22.16b, v25.16b
- eor v26.16b, v26.16b, v8.16b
- and v16.16b, v16.16b, v23.16b
- and v22.16b, v22.16b, v25.16b
- and v21.16b, v24.16b, v21.16b
- eor v8.16b, v28.16b, v8.16b
- eor v23.16b, v5.16b, v2.16b
- eor v24.16b, v1.16b, v6.16b
- eor v16.16b, v16.16b, v22.16b
- eor v22.16b, v3.16b, v0.16b
- eor v25.16b, v29.16b, v21.16b
- eor v21.16b, v26.16b, v21.16b
- eor v8.16b, v8.16b, v20.16b
- eor v26.16b, v23.16b, v24.16b
- eor v16.16b, v16.16b, v20.16b
- eor v28.16b, v22.16b, v19.16b
- eor v20.16b, v25.16b, v20.16b
- eor v9.16b, v21.16b, v9.16b
- eor v8.16b, v8.16b, v18.16b
- eor v18.16b, v5.16b, v1.16b
- eor v21.16b, v16.16b, v17.16b
- eor v16.16b, v16.16b, v17.16b
- eor v17.16b, v20.16b, v27.16b
- eor v20.16b, v3.16b, v7.16b
- eor v25.16b, v9.16b, v8.16b
- eor v27.16b, v0.16b, v4.16b
- and v29.16b, v9.16b, v17.16b
- eor v30.16b, v8.16b, v29.16b
- eor v31.16b, v21.16b, v29.16b
- eor v29.16b, v21.16b, v29.16b
- bsl v30.16b, v17.16b, v21.16b
- bsl v31.16b, v9.16b, v8.16b
- bsl v16.16b, v30.16b, v29.16b
- bsl v21.16b, v29.16b, v30.16b
- eor v8.16b, v31.16b, v30.16b
- and v1.16b, v1.16b, v31.16b
- and v9.16b, v16.16b, v31.16b
- and v6.16b, v6.16b, v30.16b
- eor v16.16b, v17.16b, v21.16b
- and v4.16b, v4.16b, v30.16b
- eor v17.16b, v8.16b, v30.16b
- and v21.16b, v24.16b, v8.16b
- eor v9.16b, v9.16b, v25.16b
- and v19.16b, v19.16b, v8.16b
- eor v24.16b, v30.16b, v16.16b
- eor v25.16b, v30.16b, v16.16b
- and v7.16b, v7.16b, v17.16b
- and v10.16b, v10.16b, v16.16b
- eor v29.16b, v9.16b, v16.16b
- eor v30.16b, v31.16b, v9.16b
- and v0.16b, v24.16b, v0.16b
- and v9.16b, v18.16b, v9.16b
- and v2.16b, v25.16b, v2.16b
- eor v10.16b, v10.16b, v6.16b
- eor v18.16b, v29.16b, v16.16b
- and v5.16b, v30.16b, v5.16b
- eor v24.16b, v8.16b, v29.16b
- and v25.16b, v26.16b, v29.16b
- and v26.16b, v28.16b, v29.16b
- eor v8.16b, v8.16b, v29.16b
- eor v17.16b, v17.16b, v18.16b
- eor v5.16b, v1.16b, v5.16b
- and v23.16b, v24.16b, v23.16b
- eor v21.16b, v21.16b, v25.16b
- eor v19.16b, v19.16b, v26.16b
- eor v0.16b, v4.16b, v0.16b
- and v3.16b, v17.16b, v3.16b
- eor v1.16b, v9.16b, v1.16b
- eor v9.16b, v25.16b, v23.16b
- eor v5.16b, v5.16b, v21.16b
- eor v2.16b, v6.16b, v2.16b
- and v6.16b, v8.16b, v22.16b
- eor v3.16b, v7.16b, v3.16b
- and v8.16b, v20.16b, v18.16b
- eor v10.16b, v10.16b, v9.16b
- eor v0.16b, v0.16b, v19.16b
- eor v9.16b, v1.16b, v9.16b
- eor v1.16b, v2.16b, v21.16b
- eor v3.16b, v3.16b, v19.16b
- and v16.16b, v27.16b, v16.16b
- eor v17.16b, v26.16b, v6.16b
- eor v6.16b, v8.16b, v7.16b
- eor v7.16b, v1.16b, v9.16b
- eor v1.16b, v5.16b, v3.16b
- eor v2.16b, v10.16b, v3.16b
- eor v4.16b, v16.16b, v4.16b
- eor v8.16b, v6.16b, v17.16b
- eor v5.16b, v9.16b, v3.16b
- eor v9.16b, v0.16b, v1.16b
- eor v6.16b, v7.16b, v1.16b
- eor v0.16b, v4.16b, v17.16b
- eor v4.16b, v8.16b, v7.16b
- eor v7.16b, v9.16b, v2.16b
- eor v8.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v5.16b
- eor v3.16b, v4.16b, v7.16b
- eor v4.16b, v7.16b, v0.16b
- eor v7.16b, v8.16b, v3.16b
- bcc .Ldec_done // the subs above borrowed (x10 was 0 before it): all rounds done, skip the mixing code below
- ext v8.16b, v0.16b, v0.16b, #8
- ext v9.16b, v1.16b, v1.16b, #8
- ldr q28, [x11] // load from .LISR in common case (x10 > 0)
- ext v10.16b, v6.16b, v6.16b, #8
- ext v16.16b, v3.16b, v3.16b, #8
- ext v17.16b, v5.16b, v5.16b, #8
- ext v18.16b, v4.16b, v4.16b, #8
- eor v8.16b, v8.16b, v0.16b
- eor v9.16b, v9.16b, v1.16b
- eor v10.16b, v10.16b, v6.16b
- eor v16.16b, v16.16b, v3.16b
- eor v17.16b, v17.16b, v5.16b
- ext v19.16b, v2.16b, v2.16b, #8
- ext v20.16b, v7.16b, v7.16b, #8
- eor v18.16b, v18.16b, v4.16b
- eor v6.16b, v6.16b, v8.16b
- eor v8.16b, v2.16b, v10.16b
- eor v4.16b, v4.16b, v9.16b
- eor v2.16b, v19.16b, v2.16b
- eor v9.16b, v20.16b, v7.16b
- eor v0.16b, v0.16b, v16.16b
- eor v1.16b, v1.16b, v16.16b
- eor v6.16b, v6.16b, v17.16b
- eor v8.16b, v8.16b, v16.16b
- eor v7.16b, v7.16b, v18.16b
- eor v4.16b, v4.16b, v16.16b
- eor v2.16b, v3.16b, v2.16b
- eor v1.16b, v1.16b, v17.16b
- eor v3.16b, v5.16b, v9.16b
- eor v5.16b, v8.16b, v17.16b
- eor v7.16b, v7.16b, v17.16b
- ext v8.16b, v0.16b, v0.16b, #12
- ext v9.16b, v6.16b, v6.16b, #12
- ext v10.16b, v4.16b, v4.16b, #12
- ext v16.16b, v1.16b, v1.16b, #12
- ext v17.16b, v5.16b, v5.16b, #12
- ext v18.16b, v7.16b, v7.16b, #12
- eor v0.16b, v0.16b, v8.16b
- eor v6.16b, v6.16b, v9.16b
- eor v4.16b, v4.16b, v10.16b
- ext v19.16b, v2.16b, v2.16b, #12
- ext v20.16b, v3.16b, v3.16b, #12
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v5.16b, v17.16b
- eor v7.16b, v7.16b, v18.16b
- eor v2.16b, v2.16b, v19.16b
- eor v16.16b, v16.16b, v0.16b
- eor v3.16b, v3.16b, v20.16b
- eor v17.16b, v17.16b, v4.16b
- eor v10.16b, v10.16b, v6.16b
- ext v0.16b, v0.16b, v0.16b, #8
- eor v9.16b, v9.16b, v1.16b
- ext v1.16b, v1.16b, v1.16b, #8
- eor v8.16b, v8.16b, v3.16b
- eor v16.16b, v16.16b, v3.16b
- eor v18.16b, v18.16b, v5.16b
- eor v19.16b, v19.16b, v7.16b
- ext v21.16b, v5.16b, v5.16b, #8
- ext v5.16b, v7.16b, v7.16b, #8
- eor v7.16b, v20.16b, v2.16b
- ext v4.16b, v4.16b, v4.16b, #8
- ext v20.16b, v3.16b, v3.16b, #8
- eor v17.16b, v17.16b, v3.16b
- ext v2.16b, v2.16b, v2.16b, #8
- eor v3.16b, v10.16b, v3.16b
- ext v10.16b, v6.16b, v6.16b, #8
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v5.16b, v18.16b
- eor v3.16b, v3.16b, v4.16b
- eor v7.16b, v20.16b, v7.16b
- eor v6.16b, v2.16b, v19.16b
- eor v4.16b, v21.16b, v17.16b
- eor v2.16b, v10.16b, v9.16b
- bne .Ldec_loop // x10 != 0 after the subs: v28 already holds .LISR for the next pass
- ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
- b .Ldec_loop
-.align 4
-.Ldec_done:
- ushr v8.2d, v0.2d, #1 // swap stages again (shift 1/2/4, masks 0x55/0x33/0x0f in v9/v17/v19), on pairs (0,1) (2,7) (6,4) (3,5)
- movi v9.16b, #0x55
- ldr q10, [x9] // v10 = quadword following the bit-sliced round keys (callers store the last round key there)
- ushr v16.2d, v2.2d, #1
- movi v17.16b, #0x33
- ushr v18.2d, v6.2d, #1
- movi v19.16b, #0x0f
- eor v8.16b, v8.16b, v1.16b
- ushr v20.2d, v3.2d, #1
- eor v16.16b, v16.16b, v7.16b
- eor v18.16b, v18.16b, v4.16b
- and v8.16b, v8.16b, v9.16b
- eor v20.16b, v20.16b, v5.16b
- and v16.16b, v16.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- shl v21.2d, v8.2d, #1
- eor v1.16b, v1.16b, v8.16b
- and v8.16b, v20.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- shl v9.2d, v16.2d, #1
- eor v4.16b, v4.16b, v18.16b
- shl v16.2d, v18.2d, #1
- eor v0.16b, v0.16b, v21.16b
- shl v18.2d, v8.2d, #1
- eor v5.16b, v5.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v6.16b, v6.16b, v16.16b
- ushr v8.2d, v1.2d, #2
- eor v3.16b, v3.16b, v18.16b
- ushr v9.2d, v0.2d, #2
- ushr v16.2d, v7.2d, #2
- ushr v18.2d, v2.2d, #2
- eor v8.16b, v8.16b, v4.16b
- eor v9.16b, v9.16b, v6.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- and v17.16b, v18.16b, v17.16b
- eor v4.16b, v4.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v5.16b, v5.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #2
- eor v1.16b, v1.16b, v8.16b
- eor v0.16b, v0.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- eor v2.16b, v2.16b, v17.16b
- ushr v8.2d, v4.2d, #4
- ushr v9.2d, v6.2d, #4
- ushr v16.2d, v1.2d, #4
- ushr v17.2d, v0.2d, #4
- eor v8.16b, v8.16b, v5.16b
- eor v9.16b, v9.16b, v3.16b
- eor v16.16b, v16.16b, v7.16b
- eor v17.16b, v17.16b, v2.16b
- and v8.16b, v8.16b, v19.16b
- and v9.16b, v9.16b, v19.16b
- and v16.16b, v16.16b, v19.16b
- and v17.16b, v17.16b, v19.16b
- eor v5.16b, v5.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v7.16b, v7.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v2.16b, v2.16b, v17.16b
- shl v17.2d, v17.2d, #4
- eor v4.16b, v4.16b, v8.16b
- eor v6.16b, v6.16b, v9.16b
- eor v7.16b, v7.16b, v10.16b // from here on, v10 (loaded from [x9] above) is XORed into each of v0-v7
- eor v1.16b, v1.16b, v16.16b
- eor v2.16b, v2.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v4.16b, v4.16b, v10.16b
- eor v6.16b, v6.16b, v10.16b
- eor v3.16b, v3.16b, v10.16b
- eor v5.16b, v5.16b, v10.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v10.16b
- ret
-.size _bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type _bsaes_const,%object
-.align 6
-_bsaes_const:
-// InvShiftRows constants (each entry is a 16-byte tbl index vector)
-// Used in _bsaes_decrypt8, which assumes contiguity: it steps from one entry to the next by adding 16 to x11
-// .LM0ISR used with round 0 key
-// .LISR used with middle round keys
-// .LISRM0 used with final round key
-.LM0ISR:
-.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-.quad 0x01040b0e0205080f, 0x0306090c00070a0d
-
-// ShiftRows constants (each entry is a 16-byte tbl index vector)
-// Used in _bsaes_encrypt8, which assumes contiguity: it steps from one entry to the next by adding 16 to x11
-// .LM0SR used with round 0 key
-// .LSR used with middle round keys
-// .LSRM0 used with final round key
-.LM0SR:
-.quad 0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-.quad 0x0304090e00050a0f, 0x01060b0c0207080d
-
-.LM0_bigendian: // tbl index vectors for _bsaes_key_convert, which picks one of the two depending on __ARMEL__
-.quad 0x02060a0e03070b0f, 0x0004080c0105090d
-.LM0_littleendian:
-.quad 0x0105090d0004080c, 0x03070b0f02060a0e
-
-// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
-// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
-.LREVM0SR:
-.quad 0x090d01050c000408, 0x03070b0f060a0e02
-
-.align 6 // pad to a 64-byte boundary; the .size below therefore includes the padding
-.size _bsaes_const,.-_bsaes_const
-
-.type _bsaes_encrypt8,%function
-.align 4
-// Internal helper: bit-sliced AES encryption of the eight blocks held in v0-v7. On entry:
-// x9 -> key (previously expanded using _bsaes_key_convert)
-// x10 = number of rounds
-// v0-v7 input data
-// On exit:
-// x9-x11 corrupted
-// other general-purpose registers preserved
-// v0-v7 output data
-// v11-v15 preserved
-// other SIMD registers corrupted
-_bsaes_encrypt8:
- ldr q8, [x9], #16 // v8 = round 0 key; x9 now points at the first bit-sliced round key
- adr x11, .LM0SR
- ldr q9, [x11], #16 // v9 = .LM0SR permutation; x11 -> .LSR (next table entry)
-_bsaes_encrypt8_alt: // alternate entry: caller has already set v8 (round 0 key), v9 (round 0 byte permutation), x9 (past the round 0 key) and x11 (middle-round permutation, last-round one 16 bytes after it)
- eor v0.16b, v0.16b, v8.16b // XOR the round 0 key into all eight blocks (interleaved with the tbl permutes)
- eor v1.16b, v1.16b, v8.16b
- sub x10, x10, #1
- eor v2.16b, v2.16b, v8.16b
- eor v4.16b, v4.16b, v8.16b
- eor v3.16b, v3.16b, v8.16b
- eor v5.16b, v5.16b, v8.16b
- tbl v0.16b, {v0.16b}, v9.16b // permute the bytes of each block by v9
- tbl v1.16b, {v1.16b}, v9.16b
- tbl v2.16b, {v2.16b}, v9.16b
- tbl v4.16b, {v4.16b}, v9.16b
- eor v6.16b, v6.16b, v8.16b
- eor v7.16b, v7.16b, v8.16b
- tbl v3.16b, {v3.16b}, v9.16b
- tbl v5.16b, {v5.16b}, v9.16b
- tbl v6.16b, {v6.16b}, v9.16b
- ushr v8.2d, v0.2d, #1 // swap stage 1 starts: shift 1, mask 0x55 (v10), register pairs (0,1) (4,5) (2,3) (6,7)
- movi v10.16b, #0x55 // v10/v16/v18 = masks for the shift-by-1/2/4 swap stages
- tbl v7.16b, {v7.16b}, v9.16b
- ushr v9.2d, v4.2d, #1
- movi v16.16b, #0x33
- ushr v17.2d, v2.2d, #1
- eor v8.16b, v8.16b, v1.16b
- movi v18.16b, #0x0f
- ushr v19.2d, v6.2d, #1
- eor v9.16b, v9.16b, v5.16b
- eor v17.16b, v17.16b, v3.16b
- and v8.16b, v8.16b, v10.16b
- eor v19.16b, v19.16b, v7.16b
- and v9.16b, v9.16b, v10.16b
- and v17.16b, v17.16b, v10.16b
- eor v1.16b, v1.16b, v8.16b
- shl v8.2d, v8.2d, #1
- and v10.16b, v19.16b, v10.16b // last use of the 0x55 mask: v10 is overwritten here
- eor v5.16b, v5.16b, v9.16b
- shl v9.2d, v9.2d, #1
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #1
- eor v0.16b, v0.16b, v8.16b
- shl v8.2d, v10.2d, #1
- eor v7.16b, v7.16b, v10.16b
- eor v4.16b, v4.16b, v9.16b
- eor v2.16b, v2.16b, v17.16b
- ushr v9.2d, v1.2d, #2 // swap stage 2 starts (interleaved with the end of stage 1): shift 2, mask 0x33 (v16)
- eor v6.16b, v6.16b, v8.16b
- ushr v8.2d, v0.2d, #2
- ushr v10.2d, v5.2d, #2
- ushr v17.2d, v4.2d, #2
- eor v9.16b, v9.16b, v3.16b
- eor v8.16b, v8.16b, v2.16b
- eor v10.16b, v10.16b, v7.16b
- eor v17.16b, v17.16b, v6.16b
- and v9.16b, v9.16b, v16.16b
- and v8.16b, v8.16b, v16.16b
- and v10.16b, v10.16b, v16.16b
- and v16.16b, v17.16b, v16.16b // last use of the 0x33 mask: v16 is overwritten here
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v2.16b, v2.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v7.16b, v7.16b, v10.16b
- shl v10.2d, v10.2d, #2
- eor v6.16b, v6.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v1.16b, v1.16b, v9.16b
- eor v0.16b, v0.16b, v8.16b
- eor v5.16b, v5.16b, v10.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v3.2d, #4 // swap stage 3: shift 4, mask 0x0f (v18)
- ushr v9.2d, v2.2d, #4
- ushr v10.2d, v1.2d, #4
- ushr v16.2d, v0.2d, #4
- eor v8.16b, v8.16b, v7.16b
- eor v9.16b, v9.16b, v6.16b
- eor v10.16b, v10.16b, v5.16b
- eor v16.16b, v16.16b, v4.16b
- and v8.16b, v8.16b, v18.16b
- and v9.16b, v9.16b, v18.16b
- and v10.16b, v10.16b, v18.16b
- and v16.16b, v16.16b, v18.16b
- eor v7.16b, v7.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #4
- eor v4.16b, v4.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v16.16b
- b .Lenc_sbox // the first pass skips the round-key load/permute at .Lenc_loop
-.align 4
-.Lenc_loop:
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 // fetch one 128-byte bit-sliced round key (64+32+16+16 bytes) and XOR it into v0-v7
- ldp q8, q9, [x9], #32
- eor v0.16b, v16.16b, v0.16b
- ldr q10, [x9], #16
- eor v1.16b, v17.16b, v1.16b
- ldr q16, [x9], #16
- eor v2.16b, v18.16b, v2.16b
- eor v3.16b, v19.16b, v3.16b
- eor v4.16b, v8.16b, v4.16b
- eor v5.16b, v9.16b, v5.16b
- eor v6.16b, v10.16b, v6.16b
- eor v7.16b, v16.16b, v7.16b
- tbl v0.16b, {v0.16b}, v28.16b // byte permutation held in v28 (loaded via x11 at the end of the previous pass)
- tbl v1.16b, {v1.16b}, v28.16b
- tbl v2.16b, {v2.16b}, v28.16b
- tbl v3.16b, {v3.16b}, v28.16b
- tbl v4.16b, {v4.16b}, v28.16b
- tbl v5.16b, {v5.16b}, v28.16b
- tbl v6.16b, {v6.16b}, v28.16b
- tbl v7.16b, {v7.16b}, v28.16b
-.Lenc_sbox:
- eor v5.16b, v5.16b, v6.16b
- eor v3.16b, v3.16b, v0.16b
- subs x10, x10, #1 // round counter; the flags set here are consumed by bcc and bne below (only SIMD ops and loads intervene)
- eor v2.16b, v2.16b, v1.16b
- eor v5.16b, v5.16b, v0.16b
- eor v8.16b, v3.16b, v7.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v5.16b
- eor v8.16b, v8.16b, v4.16b
- eor v3.16b, v6.16b, v3.16b
- eor v4.16b, v4.16b, v5.16b
- eor v6.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v7.16b
- eor v1.16b, v8.16b, v1.16b
- eor v8.16b, v7.16b, v4.16b
- eor v9.16b, v3.16b, v0.16b
- eor v10.16b, v7.16b, v6.16b
- eor v16.16b, v5.16b, v3.16b
- eor v17.16b, v6.16b, v2.16b
- eor v18.16b, v5.16b, v1.16b
- eor v19.16b, v2.16b, v4.16b
- eor v20.16b, v1.16b, v0.16b
- orr v21.16b, v8.16b, v9.16b
- orr v22.16b, v10.16b, v16.16b
- eor v23.16b, v8.16b, v17.16b
- eor v24.16b, v9.16b, v18.16b
- and v19.16b, v19.16b, v20.16b
- orr v20.16b, v17.16b, v18.16b
- and v8.16b, v8.16b, v9.16b
- and v9.16b, v17.16b, v18.16b
- and v17.16b, v23.16b, v24.16b
- and v10.16b, v10.16b, v16.16b
- eor v16.16b, v21.16b, v19.16b
- eor v18.16b, v20.16b, v19.16b
- and v19.16b, v2.16b, v1.16b
- and v20.16b, v6.16b, v5.16b
- eor v21.16b, v22.16b, v17.16b
- eor v9.16b, v9.16b, v10.16b
- eor v10.16b, v16.16b, v17.16b
- eor v16.16b, v18.16b, v8.16b
- and v17.16b, v4.16b, v0.16b
- orr v18.16b, v7.16b, v3.16b
- eor v21.16b, v21.16b, v8.16b
- eor v8.16b, v9.16b, v8.16b
- eor v9.16b, v10.16b, v19.16b
- eor v10.16b, v3.16b, v0.16b
- eor v16.16b, v16.16b, v17.16b
- eor v17.16b, v5.16b, v1.16b
- eor v19.16b, v21.16b, v20.16b
- eor v20.16b, v8.16b, v18.16b
- eor v8.16b, v8.16b, v18.16b
- eor v18.16b, v7.16b, v4.16b
- eor v21.16b, v9.16b, v16.16b
- eor v22.16b, v6.16b, v2.16b
- and v23.16b, v9.16b, v19.16b
- eor v24.16b, v10.16b, v17.16b
- eor v25.16b, v0.16b, v1.16b
- eor v26.16b, v7.16b, v6.16b
- eor v27.16b, v18.16b, v22.16b
- eor v28.16b, v3.16b, v5.16b // v28 is used as scratch here; the permutation is reloaded into it before the next .Lenc_loop pass
- eor v29.16b, v16.16b, v23.16b
- eor v30.16b, v20.16b, v23.16b
- eor v23.16b, v20.16b, v23.16b
- eor v31.16b, v4.16b, v2.16b
- bsl v29.16b, v19.16b, v20.16b
- bsl v30.16b, v9.16b, v16.16b
- bsl v8.16b, v29.16b, v23.16b
- bsl v20.16b, v23.16b, v29.16b
- eor v9.16b, v30.16b, v29.16b
- and v5.16b, v5.16b, v30.16b
- and v8.16b, v8.16b, v30.16b
- and v1.16b, v1.16b, v29.16b
- eor v16.16b, v19.16b, v20.16b
- and v2.16b, v2.16b, v29.16b
- eor v19.16b, v9.16b, v29.16b
- and v17.16b, v17.16b, v9.16b
- eor v8.16b, v8.16b, v21.16b
- and v20.16b, v22.16b, v9.16b
- eor v21.16b, v29.16b, v16.16b
- eor v22.16b, v29.16b, v16.16b
- and v23.16b, v25.16b, v16.16b
- and v6.16b, v6.16b, v19.16b
- eor v25.16b, v8.16b, v16.16b
- eor v29.16b, v30.16b, v8.16b
- and v4.16b, v21.16b, v4.16b
- and v8.16b, v28.16b, v8.16b
- and v0.16b, v22.16b, v0.16b
- eor v21.16b, v23.16b, v1.16b
- eor v22.16b, v9.16b, v25.16b
- eor v9.16b, v9.16b, v25.16b
- eor v23.16b, v25.16b, v16.16b
- and v3.16b, v29.16b, v3.16b
- and v24.16b, v24.16b, v25.16b
- and v25.16b, v27.16b, v25.16b
- and v10.16b, v22.16b, v10.16b
- and v9.16b, v9.16b, v18.16b
- eor v18.16b, v19.16b, v23.16b
- and v19.16b, v26.16b, v23.16b
- eor v3.16b, v5.16b, v3.16b
- eor v17.16b, v17.16b, v24.16b
- eor v10.16b, v24.16b, v10.16b
- and v16.16b, v31.16b, v16.16b
- eor v20.16b, v20.16b, v25.16b
- eor v9.16b, v25.16b, v9.16b
- eor v4.16b, v2.16b, v4.16b
- and v7.16b, v18.16b, v7.16b
- eor v18.16b, v19.16b, v6.16b
- eor v5.16b, v8.16b, v5.16b
- eor v0.16b, v1.16b, v0.16b
- eor v1.16b, v21.16b, v10.16b
- eor v8.16b, v3.16b, v17.16b
- eor v2.16b, v16.16b, v2.16b
- eor v3.16b, v6.16b, v7.16b
- eor v6.16b, v18.16b, v9.16b
- eor v4.16b, v4.16b, v20.16b
- eor v10.16b, v5.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v9.16b, v2.16b, v9.16b
- eor v3.16b, v3.16b, v20.16b
- eor v7.16b, v6.16b, v1.16b
- eor v5.16b, v8.16b, v4.16b
- eor v6.16b, v10.16b, v1.16b
- eor v2.16b, v4.16b, v0.16b
- eor v4.16b, v3.16b, v10.16b
- eor v9.16b, v9.16b, v7.16b
- eor v3.16b, v0.16b, v5.16b
- eor v0.16b, v1.16b, v4.16b
- eor v1.16b, v4.16b, v8.16b
- eor v4.16b, v9.16b, v5.16b
- eor v6.16b, v6.16b, v3.16b
- bcc .Lenc_done // the subs above borrowed (x10 was 0 before it): all rounds done, skip the mixing code below
- ext v8.16b, v0.16b, v0.16b, #12
- ext v9.16b, v4.16b, v4.16b, #12
- ldr q28, [x11] // middle-round permutation (.LSR when entered via _bsaes_encrypt8)
- ext v10.16b, v6.16b, v6.16b, #12
- ext v16.16b, v1.16b, v1.16b, #12
- ext v17.16b, v3.16b, v3.16b, #12
- ext v18.16b, v7.16b, v7.16b, #12
- eor v0.16b, v0.16b, v8.16b
- eor v4.16b, v4.16b, v9.16b
- eor v6.16b, v6.16b, v10.16b
- ext v19.16b, v2.16b, v2.16b, #12
- ext v20.16b, v5.16b, v5.16b, #12
- eor v1.16b, v1.16b, v16.16b
- eor v3.16b, v3.16b, v17.16b
- eor v7.16b, v7.16b, v18.16b
- eor v2.16b, v2.16b, v19.16b
- eor v16.16b, v16.16b, v0.16b
- eor v5.16b, v5.16b, v20.16b
- eor v17.16b, v17.16b, v6.16b
- eor v10.16b, v10.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #8
- eor v9.16b, v9.16b, v1.16b
- ext v1.16b, v1.16b, v1.16b, #8
- eor v8.16b, v8.16b, v5.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v7.16b
- ext v3.16b, v3.16b, v3.16b, #8
- ext v7.16b, v7.16b, v7.16b, #8
- eor v20.16b, v20.16b, v2.16b
- ext v6.16b, v6.16b, v6.16b, #8
- ext v21.16b, v5.16b, v5.16b, #8
- eor v17.16b, v17.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- eor v10.16b, v10.16b, v5.16b
- ext v22.16b, v4.16b, v4.16b, #8
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v7.16b, v18.16b
- eor v4.16b, v3.16b, v17.16b
- eor v3.16b, v6.16b, v10.16b
- eor v7.16b, v21.16b, v20.16b
- eor v6.16b, v2.16b, v19.16b
- eor v2.16b, v22.16b, v9.16b
- bne .Lenc_loop // x10 != 0 after the subs: v28 already holds the middle-round permutation
- ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
- b .Lenc_loop
-.align 4
-.Lenc_done:
- ushr v8.2d, v0.2d, #1 // swap stages again (shift 1/2/4, masks 0x55/0x33/0x0f in v9/v17/v19), on pairs (0,1) (3,7) (4,6) (2,5)
- movi v9.16b, #0x55
- ldr q10, [x9] // v10 = quadword following the bit-sliced round keys (presumably the last round key; confirm against callers)
- ushr v16.2d, v3.2d, #1
- movi v17.16b, #0x33
- ushr v18.2d, v4.2d, #1
- movi v19.16b, #0x0f
- eor v8.16b, v8.16b, v1.16b
- ushr v20.2d, v2.2d, #1
- eor v16.16b, v16.16b, v7.16b
- eor v18.16b, v18.16b, v6.16b
- and v8.16b, v8.16b, v9.16b
- eor v20.16b, v20.16b, v5.16b
- and v16.16b, v16.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- shl v21.2d, v8.2d, #1
- eor v1.16b, v1.16b, v8.16b
- and v8.16b, v20.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- shl v9.2d, v16.2d, #1
- eor v6.16b, v6.16b, v18.16b
- shl v16.2d, v18.2d, #1
- eor v0.16b, v0.16b, v21.16b
- shl v18.2d, v8.2d, #1
- eor v5.16b, v5.16b, v8.16b
- eor v3.16b, v3.16b, v9.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v1.2d, #2
- eor v2.16b, v2.16b, v18.16b
- ushr v9.2d, v0.2d, #2
- ushr v16.2d, v7.2d, #2
- ushr v18.2d, v3.2d, #2
- eor v8.16b, v8.16b, v6.16b
- eor v9.16b, v9.16b, v4.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v2.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- and v17.16b, v18.16b, v17.16b
- eor v6.16b, v6.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v4.16b, v4.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v5.16b, v5.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v2.16b, v2.16b, v17.16b
- shl v17.2d, v17.2d, #2
- eor v1.16b, v1.16b, v8.16b
- eor v0.16b, v0.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- eor v3.16b, v3.16b, v17.16b
- ushr v8.2d, v6.2d, #4
- ushr v9.2d, v4.2d, #4
- ushr v16.2d, v1.2d, #4
- ushr v17.2d, v0.2d, #4
- eor v8.16b, v8.16b, v5.16b
- eor v9.16b, v9.16b, v2.16b
- eor v16.16b, v16.16b, v7.16b
- eor v17.16b, v17.16b, v3.16b
- and v8.16b, v8.16b, v19.16b
- and v9.16b, v9.16b, v19.16b
- and v16.16b, v16.16b, v19.16b
- and v17.16b, v17.16b, v19.16b
- eor v5.16b, v5.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v2.16b, v2.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v7.16b, v7.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #4
- eor v6.16b, v6.16b, v8.16b
- eor v4.16b, v4.16b, v9.16b
- eor v7.16b, v7.16b, v10.16b // from here on, v10 (loaded from [x9] above) is XORed into each of v0-v7
- eor v1.16b, v1.16b, v16.16b
- eor v3.16b, v3.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v6.16b, v6.16b, v10.16b
- eor v4.16b, v4.16b, v10.16b
- eor v2.16b, v2.16b, v10.16b
- eor v5.16b, v5.16b, v10.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v10.16b
- ret
-.size _bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type _bsaes_key_convert,%function
-.align 4
-// Internal helper: rewrites a sequence of round keys into the layout read by _bsaes_encrypt8/_bsaes_decrypt8. On entry:
-// x9 -> input key (big-endian)
-// x10 = number of rounds
-// x17 -> output key (native endianness)
-// On exit:
-// x9, x10 corrupted
-// x11 -> .LM0_bigendian
-// x17 -> last quadword of output key (not written here; see "don't save last round key" below)
-// other general-purpose registers preserved
-// v2-v6 preserved
-// v7.16b[] = 0x63
-// v8-v14 preserved
-// v15 = last round key (converted to native endianness)
-// other SIMD registers corrupted
-_bsaes_key_convert:
-#ifdef __ARMEL__
- adr x11, .LM0_littleendian
-#else
- adr x11, .LM0_bigendian
-#endif
- ldr q0, [x9], #16 // load round 0 key
- ldr q1, [x11] // .LM0
- ldr q15, [x9], #16 // load round 1 key
-
- movi v7.16b, #0x63 // compose .L63
- movi v16.16b, #0x01 // bit masks
- movi v17.16b, #0x02
- movi v18.16b, #0x04
- movi v19.16b, #0x08
- movi v20.16b, #0x10
- movi v21.16b, #0x20
- movi v22.16b, #0x40
- movi v23.16b, #0x80
-
-#ifdef __ARMEL__
- rev32 v0.16b, v0.16b // reverse the bytes within each 32-bit word of the round 0 key
-#endif
- sub x10, x10, #1 // loop count = rounds - 1: round 0 is stored just below, the last round key is left in v15
- str q0, [x17], #16 // save round 0 key
-
-.align 4
-.Lkey_loop:
- tbl v0.16b, {v15.16b}, v1.16b // permute the current round key's bytes by .LM0
- ldr q15, [x9], #16 // load next round key
-
- eor v0.16b, v0.16b, v7.16b // XOR every byte with 0x63
- cmtst v24.16b, v0.16b, v16.16b // each cmtst expands one bit position (masks 0x01..0x80) to 0xff/0x00 per byte
- cmtst v25.16b, v0.16b, v17.16b
- cmtst v26.16b, v0.16b, v18.16b
- cmtst v27.16b, v0.16b, v19.16b
- cmtst v28.16b, v0.16b, v20.16b
- cmtst v29.16b, v0.16b, v21.16b
- cmtst v30.16b, v0.16b, v22.16b
- cmtst v31.16b, v0.16b, v23.16b
- sub x10, x10, #1
- st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
- st1 {v28.16b-v31.16b}, [x17], #64 // second half: 128 bytes are written per round key in total
- cbnz x10, .Lkey_loop
-
- // don't save last round key
-#ifdef __ARMEL__
- rev32 v15.16b, v15.16b // reverse the bytes within each 32-bit word of the last round key
- adr x11, .LM0_bigendian // exit contract: x11 -> .LM0_bigendian (already true on the big-endian path)
-#endif
- ret
-.size _bsaes_key_convert,.-_bsaes_key_convert
-
-.globl bsaes_cbc_encrypt
-.type bsaes_cbc_encrypt,%function
-.align 4
-// On entry:
-// x0 -> input ciphertext
-// x1 -> output plaintext
-// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
-// x3 -> key
-// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
-// w5 must be == 0
-// On exit:
-// Output plaintext filled in
-// Initialisation vector overwritten with last quadword of ciphertext
-// No output registers, usual AAPCS64 register preservation
-bsaes_cbc_encrypt:
- cmp x2, #128
- blo AES_cbc_encrypt
-
- // it is up to the caller to make sure we are called with enc == 0
-
- stp fp, lr, [sp, #-48]!
- stp d8, d9, [sp, #16]
- stp d10, d15, [sp, #32]
- lsr x2, x2, #4 // len in 16 byte blocks
-
- ldr w15, [x3, #240] // get # of rounds
- mov x14, sp
-
- // allocate the key schedule on the stack
- add x17, sp, #96
- sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
-
- // populate the key schedule
- mov x9, x3 // pass key
- mov x10, x15 // pass # of rounds
- mov sp, x17 // sp is sp
- bl _bsaes_key_convert
- ldr q6, [sp]
- str q15, [x17] // save last round key
- eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
- str q6, [sp]
-
- ldr q15, [x4] // load IV
- b .Lcbc_dec_loop
-
-.align 4
-.Lcbc_dec_loop:
- subs x2, x2, #0x8
- bmi .Lcbc_dec_loop_finish
-
- ldr q0, [x0], #16 // load input
- mov x9, sp // pass the key
- ldr q1, [x0], #16
- mov x10, x15
- ldr q2, [x0], #16
- ldr q3, [x0], #16
- ldr q4, [x0], #16
- ldr q5, [x0], #16
- ldr q6, [x0], #16
- ldr q7, [x0], #-7*16
-
- bl _bsaes_decrypt8
-
- ldr q16, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- eor v1.16b, v1.16b, v16.16b
- str q0, [x1], #16 // write output
- ldr q0, [x0], #16
- str q1, [x1], #16
- ldr q1, [x0], #16
- eor v1.16b, v4.16b, v1.16b
- ldr q4, [x0], #16
- eor v2.16b, v2.16b, v4.16b
- eor v0.16b, v6.16b, v0.16b
- ldr q4, [x0], #16
- str q0, [x1], #16
- str q1, [x1], #16
- eor v0.16b, v7.16b, v4.16b
- ldr q1, [x0], #16
- str q2, [x1], #16
- ldr q2, [x0], #16
- ldr q15, [x0], #16
- str q0, [x1], #16
- eor v0.16b, v5.16b, v2.16b
- eor v1.16b, v3.16b, v1.16b
- str q1, [x1], #16
- str q0, [x1], #16
-
- b .Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
- adds x2, x2, #8
- beq .Lcbc_dec_done
-
- ldr q0, [x0], #16 // load input
- cmp x2, #2
- blo .Lcbc_dec_one
- ldr q1, [x0], #16
- mov x9, sp // pass the key
- mov x10, x15
- beq .Lcbc_dec_two
- ldr q2, [x0], #16
- cmp x2, #4
- blo .Lcbc_dec_three
- ldr q3, [x0], #16
- beq .Lcbc_dec_four
- ldr q4, [x0], #16
- cmp x2, #6
- blo .Lcbc_dec_five
- ldr q5, [x0], #16
- beq .Lcbc_dec_six
- ldr q6, [x0], #-6*16
-
- bl _bsaes_decrypt8
-
- ldr q5, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q8, [x0], #16
- ldr q9, [x0], #16
- ldr q10, [x0], #16
- str q0, [x1], #16 // write output
- ldr q0, [x0], #16
- eor v1.16b, v1.16b, v5.16b
- ldr q5, [x0], #16
- eor v6.16b, v6.16b, v8.16b
- ldr q15, [x0]
- eor v4.16b, v4.16b, v9.16b
- eor v2.16b, v2.16b, v10.16b
- str q1, [x1], #16
- eor v0.16b, v7.16b, v0.16b
- str q6, [x1], #16
- eor v1.16b, v3.16b, v5.16b
- str q4, [x1], #16
- str q2, [x1], #16
- str q0, [x1], #16
- str q1, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_six:
- sub x0, x0, #0x60
- bl _bsaes_decrypt8
- ldr q3, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q5, [x0], #16
- ldr q8, [x0], #16
- ldr q9, [x0], #16
- str q0, [x1], #16 // write output
- ldr q0, [x0], #16
- eor v1.16b, v1.16b, v3.16b
- ldr q15, [x0]
- eor v3.16b, v6.16b, v5.16b
- eor v4.16b, v4.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- str q1, [x1], #16
- eor v0.16b, v7.16b, v0.16b
- str q3, [x1], #16
- str q4, [x1], #16
- str q2, [x1], #16
- str q0, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_five:
- sub x0, x0, #0x50
- bl _bsaes_decrypt8
- ldr q3, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q5, [x0], #16
- ldr q7, [x0], #16
- ldr q8, [x0], #16
- str q0, [x1], #16 // write output
- ldr q15, [x0]
- eor v0.16b, v1.16b, v3.16b
- eor v1.16b, v6.16b, v5.16b
- eor v3.16b, v4.16b, v7.16b
- str q0, [x1], #16
- eor v0.16b, v2.16b, v8.16b
- str q1, [x1], #16
- str q3, [x1], #16
- str q0, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_four:
- sub x0, x0, #0x40
- bl _bsaes_decrypt8
- ldr q2, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q3, [x0], #16
- ldr q5, [x0], #16
- str q0, [x1], #16 // write output
- ldr q15, [x0]
- eor v0.16b, v1.16b, v2.16b
- eor v1.16b, v6.16b, v3.16b
- eor v2.16b, v4.16b, v5.16b
- str q0, [x1], #16
- str q1, [x1], #16
- str q2, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_three:
- sub x0, x0, #0x30
- bl _bsaes_decrypt8
- ldr q2, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q3, [x0], #16
- ldr q15, [x0]
- str q0, [x1], #16 // write output
- eor v0.16b, v1.16b, v2.16b
- eor v1.16b, v6.16b, v3.16b
- str q0, [x1], #16
- str q1, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_two:
- sub x0, x0, #0x20
- bl _bsaes_decrypt8
- ldr q2, [x0], #16 // reload input
- eor v0.16b, v0.16b, v15.16b // ^= IV
- ldr q15, [x0]
- str q0, [x1], #16 // write output
- eor v0.16b, v1.16b, v2.16b
- str q0, [x1]
- b .Lcbc_dec_done
-.align 4
-.Lcbc_dec_one:
- sub x0, x0, #0x10
- stp x1, x4, [sp, #-32]!
- str x14, [sp, #16]
- mov v8.16b, v15.16b
- mov v15.16b, v0.16b
- mov x2, x3
- bl AES_decrypt
- ldr x14, [sp, #16]
- ldp x1, x4, [sp], #32
- ldr q0, [x1] // load result
- eor v0.16b, v0.16b, v8.16b // ^= IV
- str q0, [x1] // write output
-
-.align 4
-.Lcbc_dec_done:
- movi v0.16b, #0
- movi v1.16b, #0
-.Lcbc_dec_bzero:// wipe key schedule [if any]
- stp q0, q1, [sp], #32
- cmp sp, x14
- bne .Lcbc_dec_bzero
- str q15, [x4] // return IV
- ldp d8, d9, [sp, #16]
- ldp d10, d15, [sp, #32]
- ldp fp, lr, [sp], #48
- ret
-.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
-.globl bsaes_ctr32_encrypt_blocks
-.type bsaes_ctr32_encrypt_blocks,%function
-.align 4
-// CTR mode with a 32-bit block counter. On entry:
-// x0 -> input text (whole 16-byte blocks)
-// x1 -> output text (whole 16-byte blocks)
-// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
-// x3 -> key (the number of rounds is read from offset 240)
-// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
-// On exit:
-// Output text filled in; nothing is written back through x4
-// No output registers, usual AAPCS64 register preservation
-bsaes_ctr32_encrypt_blocks:
-
- cmp x2, #8 // use plain AES for
- blo .Lctr_enc_short // small sizes
-
- stp fp, lr, [sp, #-80]! // 80-byte frame: fp/lr followed by d8-d15
- stp d8, d9, [sp, #16]
- stp d10, d11, [sp, #32]
- stp d12, d13, [sp, #48]
- stp d14, d15, [sp, #64]
-
- ldr w15, [x3, #240] // get # of rounds
- mov x14, sp // x14 = frame base; the wipe loop at .Lctr_enc_bzero stops here
-
- // allocate the key schedule on the stack
- add x17, sp, #96
- sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes
-
- // populate the key schedule
- mov x9, x3 // pass key
- mov x10, x15 // pass # of rounds
- mov sp, x17 // sp -> base of the on-stack key schedule
- bl _bsaes_key_convert
- eor v7.16b, v7.16b, v15.16b // fix up last round key
- str q7, [x17] // save last round key
-
- ldr q0, [x4] // load counter
- add x13, x11, #.LREVM0SR-.LM0_bigendian // x13 -> .LREVM0SR, assuming _bsaes_key_convert leaves x11 -> .LM0_bigendian (not visible here)
- ldr q4, [sp] // load round0 key
-
- movi v8.4s, #1 // compose 1<<96
- movi v9.16b, #0
- rev32 v15.16b, v0.16b // v15 = counter with the bytes of each 32-bit word reversed
- rev32 v0.16b, v0.16b // v0 = same value, used as block 0 of the first iteration
- ext v11.16b, v9.16b, v8.16b, #4 // v11 = 1 in the top 32-bit lane, zero elsewhere
- rev32 v4.16b, v4.16b // byte-reverse the round0 key words to match the counter layout
- add v12.4s, v11.4s, v11.4s // compose 2<<96
- str q4, [sp] // save adjusted round0 key
- add v13.4s, v11.4s, v12.4s // compose 3<<96
- add v14.4s, v12.4s, v12.4s // compose 4<<96
- b .Lctr_enc_loop
-
-.align 4
-.Lctr_enc_loop:
- // Intermix prologue from _bsaes_encrypt8 to use the opportunity
- // to flip byte order in 32-bit counter
-
- add v1.4s, v15.4s, v11.4s // +1
- add x9, sp, #0x10 // pass next round key
- add v2.4s, v15.4s, v12.4s // +2
- ldr q9, [x13] // .LREVM0SR
- ldr q8, [sp] // load round0 key
- add v3.4s, v15.4s, v13.4s // +3
- mov x10, x15 // pass rounds
- sub x11, x13, #.LREVM0SR-.LSR // pass constants
- add v6.4s, v2.4s, v14.4s // +6
- add v4.4s, v15.4s, v14.4s // +4
- add v7.4s, v3.4s, v14.4s // +7
- add v15.4s, v4.4s, v14.4s // next counter
- add v5.4s, v1.4s, v14.4s // +5
-
- bl _bsaes_encrypt8_alt
-
- subs x2, x2, #8
- blo .Lctr_enc_loop_done // fewer than 8 blocks were left: partial tail
-
- ldr q16, [x0], #16 // full group: XOR 8 input blocks with v0, v1, v4, v6, v3, v7, v2, v5 in that order
- ldr q17, [x0], #16
- eor v1.16b, v1.16b, v17.16b
- ldr q17, [x0], #16
- eor v0.16b, v0.16b, v16.16b
- eor v4.16b, v4.16b, v17.16b
- str q0, [x1], #16
- ldr q16, [x0], #16
- str q1, [x1], #16
- mov v0.16b, v15.16b // block 0 counter for the next iteration
- str q4, [x1], #16
- ldr q1, [x0], #16
- eor v4.16b, v6.16b, v16.16b
- eor v1.16b, v3.16b, v1.16b
- ldr q3, [x0], #16
- eor v3.16b, v7.16b, v3.16b
- ldr q6, [x0], #16
- eor v2.16b, v2.16b, v6.16b
- ldr q6, [x0], #16
- eor v5.16b, v5.16b, v6.16b
- str q4, [x1], #16
- str q1, [x1], #16
- str q3, [x1], #16
- str q2, [x1], #16
- str q5, [x1], #16
-
- bne .Lctr_enc_loop // flags are still those of the subs above: blocks remain
- b .Lctr_enc_done
-
-.align 4
-.Lctr_enc_loop_done:
- add x2, x2, #8 // x2 = number of blocks still to do (1..7)
- ldr q16, [x0], #16 // load input
- eor v0.16b, v0.16b, v16.16b
- str q0, [x1], #16 // write output
- cmp x2, #2
- blo .Lctr_enc_done // only 1 block
- ldr q17, [x0], #16
- eor v1.16b, v1.16b, v17.16b
- str q1, [x1], #16
- beq .Lctr_enc_done // exactly 2 blocks
- ldr q18, [x0], #16
- eor v4.16b, v4.16b, v18.16b
- str q4, [x1], #16
- cmp x2, #4
- blo .Lctr_enc_done // exactly 3 blocks
- ldr q19, [x0], #16
- eor v6.16b, v6.16b, v19.16b
- str q6, [x1], #16
- beq .Lctr_enc_done // exactly 4 blocks
- ldr q20, [x0], #16
- eor v3.16b, v3.16b, v20.16b
- str q3, [x1], #16
- cmp x2, #6
- blo .Lctr_enc_done // exactly 5 blocks
- ldr q21, [x0], #16
- eor v7.16b, v7.16b, v21.16b
- str q7, [x1], #16
- beq .Lctr_enc_done // exactly 6 blocks
- ldr q22, [x0] // 7th and last block
- eor v2.16b, v2.16b, v22.16b
- str q2, [x1], #16
-
-.Lctr_enc_done:
- movi v0.16b, #0
- movi v1.16b, #0
-.Lctr_enc_bzero: // wipe key schedule [if any]
- stp q0, q1, [sp], #32 // zero 32 bytes and pop them
- cmp sp, x14
- bne .Lctr_enc_bzero // until sp is back at the frame base
-
- ldp d8, d9, [sp, #16]
- ldp d10, d11, [sp, #32]
- ldp d12, d13, [sp, #48]
- ldp d14, d15, [sp, #64]
- ldp fp, lr, [sp], #80
- ret
-
-.Lctr_enc_short:
-// Short path of bsaes_ctr32_encrypt_blocks, taken when x2 < 8: the counter
-// block is encrypted once per input block with AES_encrypt and the result
-// is XORed into the input.
-// Frame (96 bytes): fp/lr at sp, x19-x23 at sp+16..sp+55,
-// AES_encrypt output at sp+64, working copy of the counter block at sp+80.
- stp fp, lr, [sp, #-96]!
- stp x19, x20, [sp, #16]
- stp x21, x22, [sp, #32]
- str x23, [sp, #48]
-
- mov x19, x0 // copy arguments: x19 -> input
- mov x20, x1 // x20 -> output
- mov x21, x2 // x21 = blocks remaining
- mov x22, x3 // x22 -> key
- ldr w23, [x4, #12] // load counter .LSW
- ldr q1, [x4] // load whole counter value
-// The counter word is stored big-endian, so it has to be byte-swapped before
-// it can be incremented on a little-endian target. AArch64 toolchains
-// predefine __AARCH64EL__; __ARMEL__ is the AArch32 spelling and is only
-// available here if some header supplies it, so accept either.
-#if defined(__ARMEL__) || defined(__AARCH64EL__)
- rev w23, w23
-#endif
- str q1, [sp, #80] // copy counter value
-
-.Lctr_enc_short_loop:
- add x0, sp, #80 // input counter value
- add x1, sp, #64 // output on the stack
- mov x2, x22 // key
-
- bl AES_encrypt
-
- ldr q0, [x19], #16 // load input
- ldr q1, [sp, #64] // load encrypted counter
- add x23, x23, #1 // only the low 32 bits are stored back, so this wraps modulo 2^32
-#if defined(__ARMEL__) || defined(__AARCH64EL__)
- rev w0, w23 // back to big-endian storage order
- str w0, [sp, #80+12] // next counter value
-#else
- str w23, [sp, #80+12] // next counter value
-#endif
- eor v0.16b, v0.16b, v1.16b
- str q0, [x20], #16 // store output
- subs x21, x21, #1
- bne .Lctr_enc_short_loop
-
- movi v0.16b, #0
- movi v1.16b, #0
- stp q0, q1, [sp, #64] // wipe the encrypted counter and the counter copy
-
- ldr x23, [sp, #48]
- ldp x21, x22, [sp, #32]
- ldp x19, x20, [sp, #16]
- ldp fp, lr, [sp], #96
- ret
-.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-
-.globl bsaes_xts_encrypt
-.type bsaes_xts_encrypt,%function
-.align 4
-// On entry:
-// x0 -> input plaintext
-// x1 -> output ciphertext
-// x2 = length of text in bytes (must be at least 16)
-// x3 -> key1 (used to encrypt the XORed plaintext blocks)
-// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-// x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-// Output ciphertext filled in
-// No output registers, usual AAPCS64 register preservation
-bsaes_xts_encrypt:
- // Stack layout:
- // sp ->
- // nrounds*128-96 bytes: key schedule
- // x19 ->
- // 16 bytes: frame record
- // 4*16 bytes: tweak storage across _bsaes_encrypt8
- // 6*8 bytes: storage for 5 callee-saved general-purpose registers (x19-x23; the last 8 bytes are unused)
- // 8*8 bytes: storage for 8 callee-saved SIMD registers
- stp fp, lr, [sp, #-192]!
- stp x19, x20, [sp, #80]
- stp x21, x22, [sp, #96]
- str x23, [sp, #112]
- stp d8, d9, [sp, #128]
- stp d10, d11, [sp, #144]
- stp d12, d13, [sp, #160]
- stp d14, d15, [sp, #176]
-
- mov x19, sp // x19 = frame base (see stack layout); also where the key schedule wipe stops
- mov x20, x0 // x20 -> input
- mov x21, x1 // x21 -> output
- mov x22, x2 // x22 = byte count
- mov x23, x3 // x23 -> key1
-
- // generate initial tweak
- sub sp, sp, #16 // 16 bytes of scratch for the AES_encrypt output
- mov x0, x5 // iv[]
- mov x1, sp
- mov x2, x4 // key2
- bl AES_encrypt
- ldr q11, [sp], #16 // v11 = initial tweak; release the scratch
-
- ldr w1, [x23, #240] // get # of rounds
- // allocate the key schedule on the stack
- add x17, sp, #96
- sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
-
- // populate the key schedule
- mov x9, x23 // pass key
- mov x10, x1 // pass # of rounds
- mov sp, x17 // sp -> base of the on-stack key schedule
- bl _bsaes_key_convert
- eor v15.16b, v15.16b, v7.16b // fix up last round key
- str q15, [x17] // save last round key
-
- subs x22, x22, #0x80 // x22 = byte count - 128
- blo .Lxts_enc_short // fewer than 8 whole blocks
- b .Lxts_enc_loop
-
-.align 4
-.Lxts_enc_loop: // 8 blocks per iteration; tweak generation is interleaved with the input loads
- ldr q8, .Lxts_magic // v8 = {1, 0x87}
- mov x10, x1 // pass rounds
- add x2, x19, #16 // x2 -> tweak storage in the frame (written below)
- ldr q0, [x20], #16
- sshr v1.2d, v11.2d, #63 // all-ones in each 64-bit half whose top bit is set
- mov x9, sp // pass key schedule
- ldr q6, .Lxts_magic+16 // v6 = bit 62 set in each 64-bit half
- add v2.2d, v11.2d, v11.2d // shift each half of the tweak left by one bit
- cmtst v3.2d, v11.2d, v6.2d // bit 62 now = top bit after one doubling: carry mask for the doubling after this one
- and v1.16b, v1.16b, v8.16b // low half carry -> 1, high half carry -> 0x87
- ext v1.16b, v1.16b, v1.16b, #8 // swap halves so each carry is applied to the other half
- and v3.16b, v3.16b, v8.16b
- ldr q4, [x20], #16
- eor v12.16b, v2.16b, v1.16b // v12 = tweak for block 1
- eor v1.16b, v4.16b, v12.16b // block 1 ^ tweak
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak
- cmtst v2.2d, v12.2d, v6.2d
- add v4.2d, v12.2d, v12.2d
- add x0, x19, #16 // x0 -> tweak storage, read back after the call
- ext v3.16b, v3.16b, v3.16b, #8
- and v2.16b, v2.16b, v8.16b
- eor v13.16b, v4.16b, v3.16b // v13 = tweak for block 2
- ldr q3, [x20], #16
- ext v4.16b, v2.16b, v2.16b, #8
- eor v2.16b, v3.16b, v13.16b // block 2 ^ tweak
- ldr q3, [x20], #16
- add v5.2d, v13.2d, v13.2d
- cmtst v7.2d, v13.2d, v6.2d
- and v7.16b, v7.16b, v8.16b
- ldr q9, [x20], #16
- ext v7.16b, v7.16b, v7.16b, #8
- ldr q10, [x20], #16
- eor v14.16b, v5.16b, v4.16b // v14 = tweak for block 3
- ldr q16, [x20], #16
- add v4.2d, v14.2d, v14.2d
- eor v3.16b, v3.16b, v14.16b // block 3 ^ tweak
- eor v15.16b, v4.16b, v7.16b // v15 = tweak for block 4
- add v5.2d, v15.2d, v15.2d
- ldr q7, [x20], #16
- cmtst v4.2d, v14.2d, v6.2d
- and v17.16b, v4.16b, v8.16b
- cmtst v18.2d, v15.2d, v6.2d
- eor v4.16b, v9.16b, v15.16b // block 4 ^ tweak
- ext v9.16b, v17.16b, v17.16b, #8
- eor v9.16b, v5.16b, v9.16b // v9 = tweak for block 5
- add v17.2d, v9.2d, v9.2d
- and v18.16b, v18.16b, v8.16b
- eor v5.16b, v10.16b, v9.16b // block 5 ^ tweak
- str q9, [x2], #16 // save tweak 5 in the frame
- ext v10.16b, v18.16b, v18.16b, #8
- cmtst v9.2d, v9.2d, v6.2d
- and v9.16b, v9.16b, v8.16b
- eor v10.16b, v17.16b, v10.16b // v10 = tweak for block 6
- cmtst v17.2d, v10.2d, v6.2d
- eor v6.16b, v16.16b, v10.16b // block 6 ^ tweak
- str q10, [x2], #16 // save tweak 6 in the frame
- ext v9.16b, v9.16b, v9.16b, #8
- add v10.2d, v10.2d, v10.2d
- eor v9.16b, v10.16b, v9.16b // v9 = tweak for block 7
- str q9, [x2], #16 // save tweak 7 in the frame
- eor v7.16b, v7.16b, v9.16b // block 7 ^ tweak
- add v9.2d, v9.2d, v9.2d
- and v8.16b, v17.16b, v8.16b
- ext v8.16b, v8.16b, v8.16b, #8
- eor v8.16b, v9.16b, v8.16b
- str q8, [x2] // next round tweak
-
- bl _bsaes_encrypt8
-
- ldr q8, [x0], #16 // reload tweak 5; results are consumed in the order v0, v1, v4, v6, v3, v7, v2, v5
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- ldr q9, [x0], #16 // reload tweak 6
- eor v4.16b, v4.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- ldr q10, [x0], #16 // reload tweak 7
- eor v3.16b, v3.16b, v15.16b
- subs x22, x22, #0x80 // another 128 bytes available?
- str q0, [x21], #16
- ldr q11, [x0] // next round tweak
- str q1, [x21], #16
- eor v0.16b, v7.16b, v8.16b
- eor v1.16b, v2.16b, v9.16b
- str q4, [x21], #16
- eor v2.16b, v5.16b, v10.16b
- str q6, [x21], #16
- str q3, [x21], #16
- str q0, [x21], #16
- str q1, [x21], #16
- str q2, [x21], #16
- bpl .Lxts_enc_loop // flags are still those of the subs above
-
-.Lxts_enc_short: // fewer than 8 whole blocks remain; x22 = remaining bytes - 128
- adds x22, x22, #0x70 // x22 = remaining bytes - 16
- bmi .Lxts_enc_done // no whole block left
-
- ldr q8, .Lxts_magic // v8 = {1, 0x87}
- sshr v1.2d, v11.2d, #63
- add v2.2d, v11.2d, v11.2d
- ldr q9, .Lxts_magic+16 // v9 = bit 62 set in each 64-bit half
- subs x22, x22, #0x10
- ldr q0, [x20], #16 // block 0
- and v1.16b, v1.16b, v8.16b
- cmtst v3.2d, v11.2d, v9.2d
- ext v1.16b, v1.16b, v1.16b, #8
- and v3.16b, v3.16b, v8.16b
- eor v12.16b, v2.16b, v1.16b // v12 = tweak 1
- ext v1.16b, v3.16b, v3.16b, #8
- add v2.2d, v12.2d, v12.2d
- cmtst v3.2d, v12.2d, v9.2d
- eor v13.16b, v2.16b, v1.16b // v13 = tweak 2
- and v22.16b, v3.16b, v8.16b
- bmi .Lxts_enc_1 // exactly 1 whole block
-
- ext v2.16b, v22.16b, v22.16b, #8
- add v3.2d, v13.2d, v13.2d
- ldr q1, [x20], #16 // block 1
- cmtst v4.2d, v13.2d, v9.2d
- subs x22, x22, #0x10
- eor v14.16b, v3.16b, v2.16b // v14 = tweak 3
- and v23.16b, v4.16b, v8.16b
- bmi .Lxts_enc_2 // exactly 2 whole blocks
-
- ext v3.16b, v23.16b, v23.16b, #8
- add v4.2d, v14.2d, v14.2d
- ldr q2, [x20], #16 // block 2
- cmtst v5.2d, v14.2d, v9.2d
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak 0
- subs x22, x22, #0x10
- eor v15.16b, v4.16b, v3.16b // v15 = tweak 4
- and v24.16b, v5.16b, v8.16b
- bmi .Lxts_enc_3 // exactly 3 whole blocks
-
- ext v4.16b, v24.16b, v24.16b, #8
- add v5.2d, v15.2d, v15.2d
- ldr q3, [x20], #16 // block 3
- cmtst v6.2d, v15.2d, v9.2d
- eor v1.16b, v1.16b, v12.16b // block 1 ^ tweak 1
- subs x22, x22, #0x10
- eor v16.16b, v5.16b, v4.16b // v16 = tweak 5
- and v25.16b, v6.16b, v8.16b
- bmi .Lxts_enc_4 // exactly 4 whole blocks
-
- ext v5.16b, v25.16b, v25.16b, #8
- add v6.2d, v16.2d, v16.2d
- add x0, x19, #16 // x0 -> tweak storage in the frame
- cmtst v7.2d, v16.2d, v9.2d
- ldr q4, [x20], #16 // block 4
- eor v2.16b, v2.16b, v13.16b // block 2 ^ tweak 2
- str q16, [x0], #16 // save tweak 5 in the frame
- subs x22, x22, #0x10
- eor v17.16b, v6.16b, v5.16b // v17 = tweak 6
- and v26.16b, v7.16b, v8.16b
- bmi .Lxts_enc_5 // exactly 5 whole blocks
-
- ext v7.16b, v26.16b, v26.16b, #8
- add v18.2d, v17.2d, v17.2d
- ldr q5, [x20], #16 // block 5
- eor v3.16b, v3.16b, v14.16b // block 3 ^ tweak 3
- str q17, [x0], #16 // save tweak 6 in the frame
- subs x22, x22, #0x10
- eor v18.16b, v18.16b, v7.16b // v18 = tweak 7
- bmi .Lxts_enc_6 // exactly 6 whole blocks
-
- ldr q6, [x20], #16 // block 6: 7 whole blocks in total
- eor v4.16b, v4.16b, v15.16b
- eor v5.16b, v5.16b, v16.16b
- str q18, [x0] // next round tweak
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweaks, read back after the call
- sub x22, x22, #0x10 // x22 = trailing partial bytes - 16
- eor v6.16b, v6.16b, v17.16b
-
- bl _bsaes_encrypt8
-
- ldr q16, [x0], #16 // reload tweak 5
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- ldr q17, [x0], #16 // reload tweak 6
- eor v4.16b, v4.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- eor v3.16b, v3.16b, v15.16b
- ldr q11, [x0] // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- eor v0.16b, v7.16b, v16.16b
- eor v1.16b, v2.16b, v17.16b
- str q4, [x21], #16
- str q6, [x21], #16
- str q3, [x21], #16
- str q0, [x21], #16
- str q1, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_6: // 6 whole blocks: blocks 0-3 were XORed with their tweaks on the way here
- eor v4.16b, v4.16b, v15.16b
- eor v5.16b, v5.16b, v16.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweaks 5 and 6
-
- bl _bsaes_encrypt8
-
- ldr q16, [x0], #16 // reload tweak 5
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v4.16b, v4.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- ldr q11, [x0] // next round tweak
- eor v3.16b, v3.16b, v15.16b
- str q0, [x21], #16
- str q1, [x21], #16
- eor v0.16b, v7.16b, v16.16b
- str q4, [x21], #16
- str q6, [x21], #16
- str q3, [x21], #16
- str q0, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_5: // 5 whole blocks: blocks 0-2 were XORed with their tweaks on the way here
- eor v3.16b, v3.16b, v14.16b
- eor v4.16b, v4.16b, v15.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweak 5
-
- bl _bsaes_encrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- ldr q11, [x0] // next round tweak
- eor v4.16b, v4.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- eor v3.16b, v3.16b, v15.16b
- str q0, [x21], #16
- str q1, [x21], #16
- str q4, [x21], #16
- str q6, [x21], #16
- str q3, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_4: // 4 whole blocks: blocks 0-1 were XORed with their tweaks on the way here
- eor v2.16b, v2.16b, v13.16b
- eor v3.16b, v3.16b, v14.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_encrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v4.16b, v4.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- mov v11.16b, v15.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- str q4, [x21], #16
- str q6, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_3: // 3 whole blocks: block 0 was XORed with its tweak on the way here
- eor v1.16b, v1.16b, v12.16b
- eor v2.16b, v2.16b, v13.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_encrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v4.16b, v4.16b, v13.16b
- mov v11.16b, v14.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- str q4, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_2: // 2 whole blocks: neither has been XORed with its tweak yet
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_encrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- mov v11.16b, v13.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- b .Lxts_enc_done
-
-.align 4
-.Lxts_enc_1: // 1 whole block: encrypt it in place on the stack with AES_encrypt
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak 0
- sub x0, sp, #16 // in = out = the slot pushed by the str below
- sub x1, sp, #16
- mov x2, x23 // key1
- mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
- mov v14.d[0], v12.d[1]
- str q0, [sp, #-16]!
-
- bl AES_encrypt
-
- ldr q0, [sp], #16
- trn1 v13.2d, v11.2d, v13.2d // v13 = tweak 0 rebuilt from the two saved low halves
- trn1 v11.2d, v12.2d, v14.2d // next round tweak
- eor v0.16b, v0.16b, v13.16b
- str q0, [x21], #16
-
-.Lxts_enc_done: // all whole blocks written; x22 = trailing partial bytes - 16
- adds x22, x22, #0x10 // x22 = number of trailing partial bytes
- beq .Lxts_enc_ret // none: no ciphertext stealing needed
-
- sub x6, x21, #0x10 // x6 -> last whole ciphertext block written
- // Penultimate plaintext block produces final ciphertext part-block
- // plus remaining part of final plaintext block. Move ciphertext part
- // to final position and re-use penultimate ciphertext block buffer to
- // construct final plaintext block
-.Lxts_enc_steal:
- ldrb w0, [x20], #1 // next byte of the trailing partial plaintext
- ldrb w1, [x21, #-0x10] // matching byte of the last whole ciphertext block
- strb w0, [x21, #-0x10]
- strb w1, [x21], #1
-
- subs x22, x22, #1
- bhi .Lxts_enc_steal
-
- // Finally encrypt the penultimate ciphertext block using the
- // last tweak
- ldr q0, [x6]
- eor v0.16b, v0.16b, v11.16b
- str q0, [sp, #-16]!
- mov x0, sp // encrypt in place in the pushed slot
- mov x1, sp
- mov x2, x23 // key1
- mov x21, x6
- mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
-
- bl AES_encrypt
-
- trn1 v11.2d, v11.2d, v13.2d // rebuild the tweak from the two saved low halves
- ldr q0, [sp], #16
- eor v0.16b, v0.16b, v11.16b
- str q0, [x21]
-
-.Lxts_enc_ret:
-
- movi v0.16b, #0
- movi v1.16b, #0
-.Lxts_enc_bzero: // wipe key schedule
- stp q0, q1, [sp], #32 // zero 32 bytes and pop them
- cmp sp, x19
- bne .Lxts_enc_bzero // until sp is back at the frame base
-
- ldp x19, x20, [sp, #80]
- ldp x21, x22, [sp, #96]
- ldr x23, [sp, #112]
- ldp d8, d9, [sp, #128]
- ldp d10, d11, [sp, #144]
- ldp d12, d13, [sp, #160]
- ldp d14, d15, [sp, #176]
- ldp fp, lr, [sp], #192
- ret
-.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-// Constants for the XTS tweak update, loaded PC-relative by both XTS routines. The assembler doesn't seem capable of de-duplicating these when expressed
-// using `ldr qd,=` syntax, so assign a symbolic address
-.align 5
-.Lxts_magic:
-.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 // first 16 bytes: values ANDed with the carry masks (1 and 0x87); second 16 bytes: bit 62 of each half, the cmtst operand
-
-.globl bsaes_xts_decrypt
-.type bsaes_xts_decrypt,%function
-.align 4
-// On entry:
-// x0 -> input ciphertext
-// x1 -> output plaintext
-// x2 = length of text in bytes (must be at least 16)
-// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
-// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
-// x5 -> 16-byte initial vector (typically, sector number)
-// On exit:
-// Output plaintext filled in
-// No output registers, usual AAPCS64 register preservation
-bsaes_xts_decrypt:
- // Stack layout:
- // sp ->
- // nrounds*128-96 bytes: key schedule
- // x19 ->
- // 16 bytes: frame record
- // 4*16 bytes: tweak storage across _bsaes_decrypt8
- // 6*8 bytes: storage for 5 callee-saved general-purpose registers (x19-x23; the last 8 bytes are unused)
- // 8*8 bytes: storage for 8 callee-saved SIMD registers
- stp fp, lr, [sp, #-192]!
- stp x19, x20, [sp, #80]
- stp x21, x22, [sp, #96]
- str x23, [sp, #112]
- stp d8, d9, [sp, #128]
- stp d10, d11, [sp, #144]
- stp d12, d13, [sp, #160]
- stp d14, d15, [sp, #176]
-
- mov x19, sp // x19 = frame base (see stack layout); also where the key schedule wipe stops
- mov x20, x0 // x20 -> input
- mov x21, x1 // x21 -> output
- mov x22, x2 // x22 = byte count
- mov x23, x3 // x23 -> key1
-
- // generate initial tweak
- sub sp, sp, #16 // 16 bytes of scratch for the AES_encrypt output
- mov x0, x5 // iv[]
- mov x1, sp
- mov x2, x4 // key2
- bl AES_encrypt
- ldr q11, [sp], #16 // v11 = initial tweak; release the scratch
-
- ldr w1, [x23, #240] // get # of rounds
- // allocate the key schedule on the stack
- add x17, sp, #96
- sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
-
- // populate the key schedule
- mov x9, x23 // pass key
- mov x10, x1 // pass # of rounds
- mov sp, x17 // sp -> base of the on-stack key schedule
- bl _bsaes_key_convert
- ldr q6, [sp]
- str q15, [x17] // save last round key
- eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
- str q6, [sp]
-
- sub x30, x22, #0x10 // x30 (lr, already saved in the frame) is used as scratch
- tst x22, #0xf // if not multiple of 16
- csel x22, x30, x22, ne // subtract another 16 bytes: the last whole block is left for .Lxts_dec_done
- subs x22, x22, #0x80 // x22 = bytes to bulk-process - 128
-
- blo .Lxts_dec_short // fewer than 8 whole blocks
- b .Lxts_dec_loop
-
-.align 4
-.Lxts_dec_loop: // 8 blocks per iteration; tweak generation is interleaved with the input loads
- ldr q8, .Lxts_magic // v8 = {1, 0x87}
- mov x10, x1 // pass rounds
- add x2, x19, #16 // x2 -> tweak storage in the frame (written below)
- ldr q0, [x20], #16
- sshr v1.2d, v11.2d, #63 // all-ones in each 64-bit half whose top bit is set
- mov x9, sp // pass key schedule
- ldr q6, .Lxts_magic+16 // v6 = bit 62 set in each 64-bit half
- add v2.2d, v11.2d, v11.2d // shift each half of the tweak left by one bit
- cmtst v3.2d, v11.2d, v6.2d // bit 62 now = top bit after one doubling: carry mask for the doubling after this one
- and v1.16b, v1.16b, v8.16b // low half carry -> 1, high half carry -> 0x87
- ext v1.16b, v1.16b, v1.16b, #8 // swap halves so each carry is applied to the other half
- and v3.16b, v3.16b, v8.16b
- ldr q4, [x20], #16
- eor v12.16b, v2.16b, v1.16b // v12 = tweak for block 1
- eor v1.16b, v4.16b, v12.16b // block 1 ^ tweak
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak
- cmtst v2.2d, v12.2d, v6.2d
- add v4.2d, v12.2d, v12.2d
- add x0, x19, #16 // x0 -> tweak storage, read back after the call
- ext v3.16b, v3.16b, v3.16b, #8
- and v2.16b, v2.16b, v8.16b
- eor v13.16b, v4.16b, v3.16b // v13 = tweak for block 2
- ldr q3, [x20], #16
- ext v4.16b, v2.16b, v2.16b, #8
- eor v2.16b, v3.16b, v13.16b // block 2 ^ tweak
- ldr q3, [x20], #16
- add v5.2d, v13.2d, v13.2d
- cmtst v7.2d, v13.2d, v6.2d
- and v7.16b, v7.16b, v8.16b
- ldr q9, [x20], #16
- ext v7.16b, v7.16b, v7.16b, #8
- ldr q10, [x20], #16
- eor v14.16b, v5.16b, v4.16b // v14 = tweak for block 3
- ldr q16, [x20], #16
- add v4.2d, v14.2d, v14.2d
- eor v3.16b, v3.16b, v14.16b // block 3 ^ tweak
- eor v15.16b, v4.16b, v7.16b // v15 = tweak for block 4
- add v5.2d, v15.2d, v15.2d
- ldr q7, [x20], #16
- cmtst v4.2d, v14.2d, v6.2d
- and v17.16b, v4.16b, v8.16b
- cmtst v18.2d, v15.2d, v6.2d
- eor v4.16b, v9.16b, v15.16b // block 4 ^ tweak
- ext v9.16b, v17.16b, v17.16b, #8
- eor v9.16b, v5.16b, v9.16b // v9 = tweak for block 5
- add v17.2d, v9.2d, v9.2d
- and v18.16b, v18.16b, v8.16b
- eor v5.16b, v10.16b, v9.16b // block 5 ^ tweak
- str q9, [x2], #16 // save tweak 5 in the frame
- ext v10.16b, v18.16b, v18.16b, #8
- cmtst v9.2d, v9.2d, v6.2d
- and v9.16b, v9.16b, v8.16b
- eor v10.16b, v17.16b, v10.16b // v10 = tweak for block 6
- cmtst v17.2d, v10.2d, v6.2d
- eor v6.16b, v16.16b, v10.16b // block 6 ^ tweak
- str q10, [x2], #16 // save tweak 6 in the frame
- ext v9.16b, v9.16b, v9.16b, #8
- add v10.2d, v10.2d, v10.2d
- eor v9.16b, v10.16b, v9.16b // v9 = tweak for block 7
- str q9, [x2], #16 // save tweak 7 in the frame
- eor v7.16b, v7.16b, v9.16b // block 7 ^ tweak
- add v9.2d, v9.2d, v9.2d
- and v8.16b, v17.16b, v8.16b
- ext v8.16b, v8.16b, v8.16b, #8
- eor v8.16b, v9.16b, v8.16b
- str q8, [x2] // next round tweak
-
- bl _bsaes_decrypt8
-
- eor v6.16b, v6.16b, v13.16b // results are consumed in the order v0, v1, v6, v4, v2, v7, v3, v5
- eor v0.16b, v0.16b, v11.16b
- ldr q8, [x0], #16 // reload tweak 5
- eor v7.16b, v7.16b, v8.16b
- str q0, [x21], #16
- eor v0.16b, v1.16b, v12.16b
- ldr q1, [x0], #16 // reload tweak 6
- eor v1.16b, v3.16b, v1.16b
- subs x22, x22, #0x80 // another 128 bytes available?
- eor v2.16b, v2.16b, v15.16b
- eor v3.16b, v4.16b, v14.16b
- ldr q4, [x0], #16 // reload tweak 7
- str q0, [x21], #16
- ldr q11, [x0] // next round tweak
- eor v0.16b, v5.16b, v4.16b
- str q6, [x21], #16
- str q3, [x21], #16
- str q2, [x21], #16
- str q7, [x21], #16
- str q1, [x21], #16
- str q0, [x21], #16
- bpl .Lxts_dec_loop // flags are still those of the subs above
-
-.Lxts_dec_short: // fewer than 8 whole blocks remain to bulk-process; x22 = those bytes - 128
- adds x22, x22, #0x70 // x22 = remaining bulk bytes - 16
- bmi .Lxts_dec_done // no whole block left to bulk-process
-
- ldr q8, .Lxts_magic // v8 = {1, 0x87}
- sshr v1.2d, v11.2d, #63
- add v2.2d, v11.2d, v11.2d
- ldr q9, .Lxts_magic+16 // v9 = bit 62 set in each 64-bit half
- subs x22, x22, #0x10
- ldr q0, [x20], #16 // block 0
- and v1.16b, v1.16b, v8.16b
- cmtst v3.2d, v11.2d, v9.2d
- ext v1.16b, v1.16b, v1.16b, #8
- and v3.16b, v3.16b, v8.16b
- eor v12.16b, v2.16b, v1.16b // v12 = tweak 1
- ext v1.16b, v3.16b, v3.16b, #8
- add v2.2d, v12.2d, v12.2d
- cmtst v3.2d, v12.2d, v9.2d
- eor v13.16b, v2.16b, v1.16b // v13 = tweak 2
- and v22.16b, v3.16b, v8.16b
- bmi .Lxts_dec_1 // exactly 1 block
-
- ext v2.16b, v22.16b, v22.16b, #8
- add v3.2d, v13.2d, v13.2d
- ldr q1, [x20], #16 // block 1
- cmtst v4.2d, v13.2d, v9.2d
- subs x22, x22, #0x10
- eor v14.16b, v3.16b, v2.16b // v14 = tweak 3
- and v23.16b, v4.16b, v8.16b
- bmi .Lxts_dec_2 // exactly 2 blocks
-
- ext v3.16b, v23.16b, v23.16b, #8
- add v4.2d, v14.2d, v14.2d
- ldr q2, [x20], #16 // block 2
- cmtst v5.2d, v14.2d, v9.2d
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak 0
- subs x22, x22, #0x10
- eor v15.16b, v4.16b, v3.16b // v15 = tweak 4
- and v24.16b, v5.16b, v8.16b
- bmi .Lxts_dec_3 // exactly 3 blocks
-
- ext v4.16b, v24.16b, v24.16b, #8
- add v5.2d, v15.2d, v15.2d
- ldr q3, [x20], #16 // block 3
- cmtst v6.2d, v15.2d, v9.2d
- eor v1.16b, v1.16b, v12.16b // block 1 ^ tweak 1
- subs x22, x22, #0x10
- eor v16.16b, v5.16b, v4.16b // v16 = tweak 5
- and v25.16b, v6.16b, v8.16b
- bmi .Lxts_dec_4 // exactly 4 blocks
-
- ext v5.16b, v25.16b, v25.16b, #8
- add v6.2d, v16.2d, v16.2d
- add x0, x19, #16 // x0 -> tweak storage in the frame
- cmtst v7.2d, v16.2d, v9.2d
- ldr q4, [x20], #16 // block 4
- eor v2.16b, v2.16b, v13.16b // block 2 ^ tweak 2
- str q16, [x0], #16 // save tweak 5 in the frame
- subs x22, x22, #0x10
- eor v17.16b, v6.16b, v5.16b // v17 = tweak 6
- and v26.16b, v7.16b, v8.16b
- bmi .Lxts_dec_5 // exactly 5 blocks
-
- ext v7.16b, v26.16b, v26.16b, #8
- add v18.2d, v17.2d, v17.2d
- ldr q5, [x20], #16 // block 5
- eor v3.16b, v3.16b, v14.16b // block 3 ^ tweak 3
- str q17, [x0], #16 // save tweak 6 in the frame
- subs x22, x22, #0x10
- eor v18.16b, v18.16b, v7.16b // v18 = tweak 7
- bmi .Lxts_dec_6 // exactly 6 blocks
-
- ldr q6, [x20], #16 // block 6: 7 blocks in total
- eor v4.16b, v4.16b, v15.16b
- eor v5.16b, v5.16b, v16.16b
- str q18, [x0] // next round tweak
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweaks, read back after the call
- sub x22, x22, #0x10 // x22 = trailing partial bytes - 16
- eor v6.16b, v6.16b, v17.16b
-
- bl _bsaes_decrypt8
-
- ldr q16, [x0], #16 // reload tweak 5
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- ldr q17, [x0], #16 // reload tweak 6
- eor v6.16b, v6.16b, v13.16b
- eor v4.16b, v4.16b, v14.16b
- eor v2.16b, v2.16b, v15.16b
- ldr q11, [x0] // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- eor v0.16b, v7.16b, v16.16b
- eor v1.16b, v3.16b, v17.16b
- str q6, [x21], #16
- str q4, [x21], #16
- str q2, [x21], #16
- str q0, [x21], #16
- str q1, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_6: // 6 blocks: blocks 0-3 were XORed with their tweaks on the way here
- eor v4.16b, v4.16b, v15.16b
- eor v5.16b, v5.16b, v16.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweaks 5 and 6
-
- bl _bsaes_decrypt8
-
- ldr q16, [x0], #16 // reload tweak 5
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- eor v4.16b, v4.16b, v14.16b
- ldr q11, [x0] // next round tweak
- eor v2.16b, v2.16b, v15.16b
- str q0, [x21], #16
- str q1, [x21], #16
- eor v0.16b, v7.16b, v16.16b
- str q6, [x21], #16
- str q4, [x21], #16
- str q2, [x21], #16
- str q0, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_5: // 5 blocks: blocks 0-2 were XORed with their tweaks on the way here
- eor v3.16b, v3.16b, v14.16b
- eor v4.16b, v4.16b, v15.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16 // x0 -> saved tweak 5
-
- bl _bsaes_decrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- ldr q11, [x0] // next round tweak
- eor v6.16b, v6.16b, v13.16b
- eor v4.16b, v4.16b, v14.16b
- eor v2.16b, v2.16b, v15.16b
- str q0, [x21], #16
- str q1, [x21], #16
- str q6, [x21], #16
- str q4, [x21], #16
- str q2, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_4: // 4 blocks: blocks 0-1 were XORed with their tweaks on the way here
- eor v2.16b, v2.16b, v13.16b
- eor v3.16b, v3.16b, v14.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_decrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- eor v4.16b, v4.16b, v14.16b
- mov v11.16b, v15.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- str q6, [x21], #16
- str q4, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_3: // 3 blocks: block 0 was XORed with its tweak on the way here
- eor v1.16b, v1.16b, v12.16b
- eor v2.16b, v2.16b, v13.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_decrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- mov v11.16b, v14.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- str q6, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_2: // 2 blocks: neither has been XORed with its tweak yet
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- mov x9, sp // pass key schedule
- mov x10, x1 // pass rounds
- add x0, x19, #16
-
- bl _bsaes_decrypt8
-
- eor v0.16b, v0.16b, v11.16b
- eor v1.16b, v1.16b, v12.16b
- mov v11.16b, v13.16b // next round tweak
- str q0, [x21], #16
- str q1, [x21], #16
- b .Lxts_dec_done
-
-.align 4
-.Lxts_dec_1: // 1 block: decrypt it in place on the stack with AES_decrypt
- eor v0.16b, v0.16b, v11.16b // block 0 ^ tweak 0
- sub x0, sp, #16 // in = out = the slot pushed by the str below
- sub x1, sp, #16
- mov x2, x23 // key1
- mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
- mov v14.d[0], v12.d[1]
- str q0, [sp, #-16]!
-
- bl AES_decrypt
-
- ldr q0, [sp], #16
- trn1 v13.2d, v11.2d, v13.2d // v13 = tweak 0 rebuilt from the two saved low halves
- trn1 v11.2d, v12.2d, v14.2d // next round tweak
- eor v0.16b, v0.16b, v13.16b
- str q0, [x21], #16
-
-.Lxts_dec_done: // bulk blocks written; x22 = trailing partial bytes - 16
- adds x22, x22, #0x10 // x22 = number of trailing partial bytes
- beq .Lxts_dec_ret // none: no ciphertext stealing needed
-
- // calculate one round of extra tweak for the stolen ciphertext
- ldr q8, .Lxts_magic // v8 = {1, 0x87}
- sshr v6.2d, v11.2d, #63
- and v6.16b, v6.16b, v8.16b
- add v12.2d, v11.2d, v11.2d
- ext v6.16b, v6.16b, v6.16b, #8
- eor v12.16b, v12.16b, v6.16b // v12 = last tweak; v11 stays the penultimate tweak
-
- // perform the final decryption with the last tweak value
- ldr q0, [x20], #16 // the whole ciphertext block that was held back at entry
- eor v0.16b, v0.16b, v12.16b
- str q0, [sp, #-16]!
- mov x0, sp // decrypt in place in the pushed slot
- mov x1, sp
- mov x2, x23 // key1
- mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
- mov v14.d[0], v12.d[1]
-
- bl AES_decrypt
-
- trn1 v12.2d, v12.2d, v14.2d // rebuild both tweaks from their saved low halves
- trn1 v11.2d, v11.2d, v13.2d
- ldr q0, [sp], #16
- eor v0.16b, v0.16b, v12.16b
- str q0, [x21]
-
- mov x6, x21 // x6 -> block that is rebuilt and decrypted again below
- // Penultimate ciphertext block produces final plaintext part-block
- // plus remaining part of final ciphertext block. Move plaintext part
- // to final position and re-use penultimate plaintext block buffer to
- // construct final ciphertext block
-.Lxts_dec_steal:
- ldrb w1, [x21] // plaintext byte just produced
- ldrb w0, [x20], #1 // next byte of the trailing partial ciphertext
- strb w1, [x21, #0x10]
- strb w0, [x21], #1
-
- subs x22, x22, #1
- bhi .Lxts_dec_steal
-
- // Finally decrypt the penultimate plaintext block using the
- // penultimate tweak
- ldr q0, [x6]
- eor v0.16b, v0.16b, v11.16b
- str q0, [sp, #-16]!
- mov x0, sp // decrypt in place in the pushed slot
- mov x1, sp
- mov x2, x23 // key1
- mov x21, x6
-
- bl AES_decrypt
-
- trn1 v11.2d, v11.2d, v13.2d // v13.d[0] still holds the top half of v11 saved before the previous call
- ldr q0, [sp], #16
- eor v0.16b, v0.16b, v11.16b
- str q0, [x21]
-
-.Lxts_dec_ret:
-
- movi v0.16b, #0
- movi v1.16b, #0
-.Lxts_dec_bzero: // wipe key schedule
- stp q0, q1, [sp], #32 // zero 32 bytes and pop them
- cmp sp, x19
- bne .Lxts_dec_bzero // until sp is back at the frame base
-
- ldp x19, x20, [sp, #80]
- ldp x21, x22, [sp, #96]
- ldr x23, [sp, #112]
- ldp d8, d9, [sp, #128]
- ldp d10, d11, [sp, #144]
- ldp d12, d13, [sp, #160]
- ldp d14, d15, [sp, #176]
- ldp fp, lr, [sp], #192
- ret
-.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/crypto/aes/build.info b/crypto/aes/build.info
index edf6c8106e..0b9f499ee6 100644
--- a/crypto/aes/build.info
+++ b/crypto/aes/build.info
@@ -30,8 +30,8 @@ IF[{- !$disabled{asm} -}]
$AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S
$AESDEF_armv4=AES_ASM BSAES_ASM
- $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S bsaes-armv8.S vpaes-armv8.S
- $AESDEF_aarch64=BSAES_ASM VPAES_ASM
+ $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S
+ $AESDEF_aarch64=VPAES_ASM
$AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s
$AESDEF_parisc11=AES_ASM
@@ -80,7 +80,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
ENDIF
GENERATE[aes-ia64.s]=asm/aes-ia64.S
-GENERATE[bsaes-armv8.S]=asm/bsaes-armv8.S
GENERATE[aes-586.s]=asm/aes-586.pl
DEPEND[aes-586.s]=../perlasm/x86asm.pl
More information about the openssl-commits
mailing list