[openssl-commits] [openssl] OpenSSL_1_0_2-stable update
Andy Polyakov
appro at openssl.org
Thu Apr 2 07:52:15 UTC 2015
The branch OpenSSL_1_0_2-stable has been updated
via 0a1f31f7ba837aeaa28e49ed323e60fdd4255b28 (commit)
via 5a27a20be3c67c2ba5f0258b563bcfe41b1befe1 (commit)
from 3d5bb773ecd78f75984fb096bb0be7808d3dc18d (commit)
- Log -----------------------------------------------------------------
commit 0a1f31f7ba837aeaa28e49ed323e60fdd4255b28
Author: Andy Polyakov <appro at openssl.org>
Date: Sat Mar 28 22:01:59 2015 +0100
sha/asm/sha*-armv8.pl: add Denver and X-Gene results.
Reviewed-by: Richard Levitte <levitte at openssl.org>
(cherry picked from commit be5a87a1b00aceba5484a7ec198ac622c9283def)
commit 5a27a20be3c67c2ba5f0258b563bcfe41b1befe1
Author: Andy Polyakov <appro at openssl.org>
Date: Tue Mar 3 22:05:25 2015 +0100
aes/asm/aesv8-armx.pl: optimize for Cortex-A5x.
ARM has optimized the Cortex-A5x pipeline to favour pairs of complementary
AES instructions. The modified code improves post-r0p0 Cortex-A53
performance by >40% (for CBC decrypt and CTR), but it hurts the original
r0p0. We favour the later revisions, because one can't prevent the future
from coming. The improvement on post-r0p0 Cortex-A57 exceeds 50%, and the
new code is no slower on r0p0, or on Apple A7 for that matter.
[Also update the SHA results for the latest Cortex-A53.]
Reviewed-by: Richard Levitte <levitte at openssl.org>
(cherry picked from commit 94376cccb4ed5b376220bffe0739140ea9dad8c8)
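A minimal sketch of the pairing described above, written with the ARMv8 Crypto
intrinsics from <arm_neon.h> rather than the module's generated assembly: each
AESE is immediately followed by its AESMC, which is the adjacency the Cortex-A5x
pipeline rewards, with key-schedule loads kept out from between the pair. The
function name and the assumption of an already-expanded key schedule rk[0..10]
are illustrative only, not part of this commit; build with -march=armv8-a+crypto
on an AArch64 target.

#include <arm_neon.h>

/* Hypothetical helper, not part of aesv8-armx.pl: encrypt one AES-128 block
 * keeping every AESE/AESMC pair back to back. */
static uint8x16_t aes128_encrypt_block(uint8x16_t block, const uint8x16_t rk[11])
{
    for (int i = 0; i < 9; i++) {
        block = vaeseq_u8(block, rk[i]);  /* AddRoundKey + SubBytes + ShiftRows */
        block = vaesmcq_u8(block);        /* MixColumns, paired with the AESE   */
    }
    block = vaeseq_u8(block, rk[9]);      /* last round has no MixColumns */
    return veorq_u8(block, rk[10]);       /* final AddRoundKey */
}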
-----------------------------------------------------------------------
Summary of changes:
crypto/aes/asm/aesv8-armx.pl | 209 +++++++++++++++++++++++------------------
crypto/sha/asm/sha1-armv8.pl | 6 +-
crypto/sha/asm/sha512-armv8.pl | 6 +-
3 files changed, 127 insertions(+), 94 deletions(-)
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index 1e93f86..95ebae3 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -24,8 +24,12 @@
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
-# Cortex-A53 2.45 1.87 1.94
-# Cortex-A57 3.64 1.34 1.32
+# Cortex-A53 1.32 1.29 1.46
+# Cortex-A57(*) 1.95 0.85 0.93
+# Denver 1.96 0.86 0.80
+#
+# (*) original 3.64/1.34/1.32 results were for the r0p0 revision
+# and are still the same even for the updated module;
$flavour = shift;
open STDOUT,">".shift;
@@ -308,17 +312,17 @@ ${prefix}_${dir}crypt:
.Loop_${dir}c:
aes$e $inout,$rndkey0
- vld1.32 {$rndkey0},[$key],#16
aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key],#16
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
- vld1.32 {$rndkey1},[$key],#16
aes$mc $inout,$inout
+ vld1.32 {$rndkey1},[$key],#16
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
- vld1.32 {$rndkey0},[$key]
aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key]
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
@@ -336,6 +340,7 @@ my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
### q8-q15 preloaded key schedule
@@ -385,16 +390,42 @@ $code.=<<___;
veor $rndzero_n_last,q8,$rndlast
b.eq .Lcbc_enc128
+ vld1.32 {$in0-$in1},[$key_]
+ add $key_,$key,#16
+ add $key4,$key,#16*4
+ add $key5,$key,#16*5
+ aese $dat,q8
+ aesmc $dat,$dat
+ add $key6,$key,#16*6
+ add $key7,$key,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
aese $dat,q8
- vld1.32 {q8},[$key_],#16
aesmc $dat,$dat
- subs $cnt,$cnt,#2
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc:
aese $dat,q9
- vld1.32 {q9},[$key_],#16
aesmc $dat,$dat
- b.gt .Loop_cbc_enc
+ aese $dat,$in0
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key4]
+ cmp $rounds,#4
+ aese $dat,$in1
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key5]
+ b.eq .Lcbc_enc192
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key6]
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key7]
+ nop
+.Lcbc_enc192:
aese $dat,q8
aesmc $dat,$dat
subs $len,$len,#16
@@ -403,7 +434,6 @@ $code.=<<___;
cclr $step,eq
aese $dat,q10
aesmc $dat,$dat
- add $key_,$key,#16
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
@@ -412,16 +442,14 @@ $code.=<<___;
veor q8,q8,$rndzero_n_last
aese $dat,q13
aesmc $dat,$dat
- vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
aese $dat,q14
aesmc $dat,$dat
aese $dat,q15
-
- mov $cnt,$rounds
veor $ivec,$dat,$rndlast
- vst1.8 {$ivec},[$out],#16
b.hs .Loop_cbc_enc
+ vst1.8 {$ivec},[$out],#16
b .Lcbc_done
.align 5
@@ -483,79 +511,78 @@ $code.=<<___;
.Loop3x_cbc_dec:
aesd $dat0,q8
- aesd $dat1,q8
- aesd $dat2,q8
- vld1.32 {q8},[$key_],#16
aesimc $dat0,$dat0
+ aesd $dat1,q8
aesimc $dat1,$dat1
+ aesd $dat2,q8
aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat0,q9
- aesd $dat1,q9
- aesd $dat2,q9
- vld1.32 {q9},[$key_],#16
aesimc $dat0,$dat0
+ aesd $dat1,q9
aesimc $dat1,$dat1
+ aesd $dat2,q9
aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
b.gt .Loop3x_cbc_dec
aesd $dat0,q8
- aesd $dat1,q8
- aesd $dat2,q8
- veor $tmp0,$ivec,$rndlast
aesimc $dat0,$dat0
+ aesd $dat1,q8
aesimc $dat1,$dat1
+ aesd $dat2,q8
aesimc $dat2,$dat2
+ veor $tmp0,$ivec,$rndlast
+ subs $len,$len,#0x30
veor $tmp1,$in0,$rndlast
+ mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q9
- aesd $dat1,q9
- aesd $dat2,q9
- veor $tmp2,$in1,$rndlast
- subs $len,$len,#0x30
aesimc $dat0,$dat0
+ aesd $dat1,q9
aesimc $dat1,$dat1
+ aesd $dat2,q9
aesimc $dat2,$dat2
- vorr $ivec,$in2,$in2
- mov.lo x6,$len // x6, $cnt, is zero at this point
- aesd $dat0,q12
- aesd $dat1,q12
- aesd $dat2,q12
+ veor $tmp2,$in1,$rndlast
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat2
// are loaded with last "words"
+ vorr $ivec,$in2,$in2
+ mov $key_,$key
+ aesd $dat0,q12
aesimc $dat0,$dat0
+ aesd $dat1,q12
aesimc $dat1,$dat1
+ aesd $dat2,q12
aesimc $dat2,$dat2
- mov $key_,$key
- aesd $dat0,q13
- aesd $dat1,q13
- aesd $dat2,q13
vld1.8 {$in0},[$inp],#16
+ aesd $dat0,q13
aesimc $dat0,$dat0
+ aesd $dat1,q13
aesimc $dat1,$dat1
+ aesd $dat2,q13
aesimc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
- aesd $dat1,q14
- aesd $dat2,q14
- vld1.8 {$in2},[$inp],#16
aesimc $dat0,$dat0
+ aesd $dat1,q14
aesimc $dat1,$dat1
+ aesd $dat2,q14
aesimc $dat2,$dat2
- vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ vld1.8 {$in2},[$inp],#16
aesd $dat0,q15
aesd $dat1,q15
aesd $dat2,q15
-
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
add $cnt,$rounds,#2
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
veor $dat2,$dat2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
- vorr $dat0,$in0,$in0
vst1.8 {$tmp0},[$out],#16
- vorr $dat1,$in1,$in1
+ vorr $dat0,$in0,$in0
vst1.8 {$tmp1},[$out],#16
+ vorr $dat1,$in1,$in1
vst1.8 {$dat2},[$out],#16
vorr $dat2,$in2,$in2
b.hs .Loop3x_cbc_dec
@@ -566,39 +593,39 @@ $code.=<<___;
.Lcbc_dec_tail:
aesd $dat1,q8
- aesd $dat2,q8
- vld1.32 {q8},[$key_],#16
aesimc $dat1,$dat1
+ aesd $dat2,q8
aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat1,q9
- aesd $dat2,q9
- vld1.32 {q9},[$key_],#16
aesimc $dat1,$dat1
+ aesd $dat2,q9
aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
b.gt .Lcbc_dec_tail
aesd $dat1,q8
- aesd $dat2,q8
aesimc $dat1,$dat1
+ aesd $dat2,q8
aesimc $dat2,$dat2
aesd $dat1,q9
- aesd $dat2,q9
aesimc $dat1,$dat1
+ aesd $dat2,q9
aesimc $dat2,$dat2
aesd $dat1,q12
- aesd $dat2,q12
aesimc $dat1,$dat1
+ aesd $dat2,q12
aesimc $dat2,$dat2
cmn $len,#0x20
aesd $dat1,q13
- aesd $dat2,q13
aesimc $dat1,$dat1
+ aesd $dat2,q13
aesimc $dat2,$dat2
veor $tmp1,$ivec,$rndlast
aesd $dat1,q14
- aesd $dat2,q14
aesimc $dat1,$dat1
+ aesd $dat2,q14
aesimc $dat2,$dat2
veor $tmp2,$in1,$rndlast
aesd $dat1,q15
@@ -699,70 +726,69 @@ $code.=<<___;
.align 4
.Loop3x_ctr32:
aese $dat0,q8
- aese $dat1,q8
- aese $dat2,q8
- vld1.32 {q8},[$key_],#16
aesmc $dat0,$dat0
+ aese $dat1,q8
aesmc $dat1,$dat1
+ aese $dat2,q8
aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
- aese $dat1,q9
- aese $dat2,q9
- vld1.32 {q9},[$key_],#16
aesmc $dat0,$dat0
+ aese $dat1,q9
aesmc $dat1,$dat1
+ aese $dat2,q9
aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ctr32
aese $dat0,q8
- aese $dat1,q8
- aese $dat2,q8
- mov $key_,$key
aesmc $tmp0,$dat0
- vld1.8 {$in0},[$inp],#16
+ aese $dat1,q8
aesmc $tmp1,$dat1
- aesmc $dat2,$dat2
+ vld1.8 {$in0},[$inp],#16
vorr $dat0,$ivec,$ivec
- aese $tmp0,q9
+ aese $dat2,q8
+ aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
- aese $tmp1,q9
- aese $dat2,q9
vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
aesmc $tmp0,$tmp0
- vld1.8 {$in2},[$inp],#16
+ aese $tmp1,q9
aesmc $tmp1,$tmp1
+ vld1.8 {$in2},[$inp],#16
+ mov $key_,$key
+ aese $dat2,q9
aesmc $tmp2,$dat2
vorr $dat2,$ivec,$ivec
add $tctr0,$ctr,#1
aese $tmp0,q12
+ aesmc $tmp0,$tmp0
aese $tmp1,q12
- aese $tmp2,q12
+ aesmc $tmp1,$tmp1
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
+ aese $tmp2,q12
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
+ aesmc $tmp0,$tmp0
aese $tmp1,q13
- aese $tmp2,q13
+ aesmc $tmp1,$tmp1
veor $in2,$in2,$rndlast
rev $tctr0,$tctr0
- aesmc $tmp0,$tmp0
- vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
- aesmc $tmp1,$tmp1
+ aese $tmp2,q13
aesmc $tmp2,$tmp2
vmov.32 ${dat0}[3], $tctr0
rev $tctr1,$tctr1
aese $tmp0,q14
+ aesmc $tmp0,$tmp0
aese $tmp1,q14
- aese $tmp2,q14
+ aesmc $tmp1,$tmp1
vmov.32 ${dat1}[3], $tctr1
rev $tctr2,$ctr
- aesmc $tmp0,$tmp0
- aesmc $tmp1,$tmp1
+ aese $tmp2,q14
aesmc $tmp2,$tmp2
vmov.32 ${dat2}[3], $tctr2
subs $len,$len,#3
@@ -770,13 +796,14 @@ $code.=<<___;
aese $tmp1,q15
aese $tmp2,q15
- mov $cnt,$rounds
veor $in0,$in0,$tmp0
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ vst1.8 {$in0},[$out],#16
veor $in1,$in1,$tmp1
+ mov $cnt,$rounds
+ vst1.8 {$in1},[$out],#16
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
- vst1.8 {$in0},[$out],#16
- vst1.8 {$in1},[$out],#16
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
@@ -788,40 +815,40 @@ $code.=<<___;
.Lctr32_tail:
aese $dat0,q8
- aese $dat1,q8
- vld1.32 {q8},[$key_],#16
aesmc $dat0,$dat0
+ aese $dat1,q8
aesmc $dat1,$dat1
+ vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
- aese $dat1,q9
- vld1.32 {q9},[$key_],#16
aesmc $dat0,$dat0
+ aese $dat1,q9
aesmc $dat1,$dat1
+ vld1.32 {q9},[$key_],#16
b.gt .Lctr32_tail
aese $dat0,q8
- aese $dat1,q8
aesmc $dat0,$dat0
+ aese $dat1,q8
aesmc $dat1,$dat1
aese $dat0,q9
- aese $dat1,q9
aesmc $dat0,$dat0
+ aese $dat1,q9
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
- aese $dat1,q12
- vld1.8 {$in1},[$inp]
aesmc $dat0,$dat0
+ aese $dat1,q12
aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp]
aese $dat0,q13
- aese $dat1,q13
aesmc $dat0,$dat0
+ aese $dat1,q13
aesmc $dat1,$dat1
- aese $dat0,q14
- aese $dat1,q14
veor $in0,$in0,$rndlast
+ aese $dat0,q14
aesmc $dat0,$dat0
+ aese $dat1,q14
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
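The CBC and CTR loops above are the primitives an application normally reaches
through the EVP layer, which I assume dispatches to this module once the ARMv8
AES instructions are detected. A minimal usage sketch, not part of the patch
and with error handling reduced to a single status, might look like this:

#include <openssl/evp.h>

/* Hypothetical helper: AES-128-CTR encryption via EVP. Returns the number of
 * output bytes written, or 0 on error. */
static int aes128_ctr_encrypt(const unsigned char key[16],
                              const unsigned char iv[16],
                              const unsigned char *in, int inlen,
                              unsigned char *out)
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int len = 0, total = 0;

    if (ctx == NULL)
        return 0;
    if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_ctr(), NULL, key, iv)
        || !EVP_EncryptUpdate(ctx, out, &len, in, inlen)) {
        EVP_CIPHER_CTX_free(ctx);
        return 0;
    }
    total = len;
    if (!EVP_EncryptFinal_ex(ctx, out + total, &len)) {
        EVP_CIPHER_CTX_free(ctx);
        return 0;
    }
    total += len;
    EVP_CIPHER_CTX_free(ctx);
    return total;
}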
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
index deb1238..c04432a 100644
--- a/crypto/sha/asm/sha1-armv8.pl
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -14,10 +14,14 @@
#
# hardware-assisted software(*)
# Apple A7 2.31 4.13 (+14%)
-# Cortex-A53 2.19 8.73 (+108%)
+# Cortex-A53 2.24 8.03 (+97%)
# Cortex-A57 2.35 7.88 (+74%)
+# Denver 2.13 3.97 (+0%)(**)
+# X-Gene 8.80 (+200%)
#
# (*) Software results are presented mostly for reference purposes.
+# (**) Keep in mind that Denver relies on binary translation, which
+# optimizes compiler output at run-time.
$flavour = shift;
open STDOUT,">".shift;
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
index bd7a0a5..f7b36b9 100644
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -14,8 +14,10 @@
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-# Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***))
+# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+# Denver 2.01 10.5 (+26%) 6.70 (+8%)
+# X-Gene 20.0 (+100%) 12.8 (+300%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
@@ -25,7 +27,7 @@
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
-# and lags behind assembly only by 50-90%.
+# and the gap is only 40-90%.
$flavour=shift;
$output=shift;