[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Mon Nov 16 12:11:39 UTC 2015
The branch master has been updated
via 9d0e4dc6351df7d0c08400c4b4cf17c017022e50 (commit)
from a5fd24d19bbb586b1c6d235c2021e9bead22c9f5 (commit)
- Log -----------------------------------------------------------------
commit 9d0e4dc6351df7d0c08400c4b4cf17c017022e50
Author: Andy Polyakov <appro at openssl.org>
Date: Tue Nov 10 21:11:24 2015 +0100
bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%. [even z10 is couple percent faster]. Triggered by RT#4128, but solves the problem by real modulo-scheduling.
Reviewed-by: Rich Salz <rsalz at openssl.org>
-----------------------------------------------------------------------
Summary of changes:
crypto/bn/asm/s390x.S | 109 +++++++++++++++++++++++++++++++++-----------------
1 file changed, 72 insertions(+), 37 deletions(-)
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0;
+ la %r1,0(%r2) // put rp aside [to give way to]
+ lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
- stmg %r6,%r10,48(%r15)
- lghi %r10,3
- lghi %r8,0 // carry = 0
- nr %r10,%r4 // len%4
+ stmg %r6,%r13,48(%r15)
+ lghi %r2,3
+ lghi %r12,0 // carry = 0
+ slgr %r1,%r3 // rp-=ap
+ nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
-.Loop4_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[0]
+ lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
- alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ brct %r4,.Loop4_madd
+ j .Loop4_madd_tail
- lg %r9,8(%r2,%r3)
+.Loop4_madd:
mlgr %r8,%r5
+ lg %r11,16(%r3) // ap[i+2]
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,8(%r2,%r1)
- stg %r9,8(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
+
+ mlgr %r12,%r5
+ lg %r7,32(%r3)
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- lg %r7,16(%r2,%r3)
mlgr %r6,%r5
- alcgr %r7,%r8
- alcgr %r6,zero
- alg %r7,16(%r2,%r1)
- stg %r7,16(%r2,%r1)
+ lg %r9,40(%r3)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
- lg %r9,24(%r2,%r3)
+ la %r3,32(%r3) // i+=4
+ brct %r4,.Loop4_madd
+
+.Loop4_madd_tail:
mlgr %r8,%r5
+ lg %r11,16(%r3)
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,24(%r2,%r1)
- stg %r9,24(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
- la %r2,32(%r2) // i+=4
- brct %r4,.Loop4_madd
+ mlgr %r12,%r5
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- la %r10,1(%r10) // see if len%4 is zero ...
- brct %r10,.Loop1_madd // without touching condition code:-)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
+
+ la %r3,32(%r3) // i+=4
+
+ la %r2,1(%r2) // see if len%4 is zero ...
+ brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
- alcgr %r8,zero // collect carry bit
- lgr %r2,%r8
- lmg %r6,%r10,48(%r15)
+ lgr %r2,zero // return value
+ alcgr %r2,%r12 // collect even carry bit
+ lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
+ alcgr %r7,%r12 // +=carry
alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
- lgr %r8,%r6
- la %r2,8(%r2) // i++
- brct %r10,.Loop1_madd
+ lgr %r12,%r6
+ la %r3,8(%r3) // i++
+ brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
More information about the openssl-commits
mailing list