[openssl-commits] [openssl] master update
Andy Polyakov
appro at openssl.org
Wed May 30 20:53:19 UTC 2018
The branch master has been updated
via c869c3ada944bc42a6c00e0433c9d523c4426cde (commit)
via 95c81f8c8895615b4db0edde8ce5d1e030edd2f9 (commit)
from 2fc45cb805f85589bb49c3669864152e909696da (commit)
- Log -----------------------------------------------------------------
commit c869c3ada944bc42a6c00e0433c9d523c4426cde
Author: Andy Polyakov <appro at openssl.org>
Date: Sun May 27 14:04:48 2018 +0200
chacha/asm/chacha-ppc.pl: optimize AltiVec/VMX code path.
The 32-bit vector rotate instruction has been defined from the very
beginning; that it went unused from the start must be a brain-slip...
Reviewed-by: Bernd Edlinger <bernd.edlinger at hotmail.de>
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6363)
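For context, a minimal scalar Perl sketch (illustration only, not part of
the patch) of the equivalence the commit exploits: the old code built each
rotate from a shift-left, a shift-right and an or (vslw/vsrw/vor, plus a
temporary register), while a single vrlw performs the same rotate per lane.

    #!/usr/bin/env perl
    use strict;
    use warnings;

    sub rotl_vrlw {                # what one vrlw lane computes
        my ($x, $n) = @_;
        return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
    }

    sub rotl_old {                 # the old vslw/vsrw/vor sequence, per lane
        my ($x, $n) = @_;
        my $hi = ($x << $n) & 0xffffffff;      # vslw by $n
        my $lo = $x >> (32 - $n);              # vsrw by 32-$n
        return $hi | $lo;                      # vor
    }

    for my $n (7, 12) {            # the two non-byte-aligned ChaCha rotates
        die "mismatch" if rotl_vrlw(0xdeadbeef, $n) != rotl_old(0xdeadbeef, $n);
    }
    print "one vrlw == vslw+vsrw+vor\n";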
commit 95c81f8c8895615b4db0edde8ce5d1e030edd2f9
Author: Andy Polyakov <appro at openssl.org>
Date: Sun May 27 14:03:00 2018 +0200
perlasm/ppc-xlate.pl: add new instructions and clean up.
Reviewed-by: Bernd Edlinger <bernd.edlinger at hotmail.de>
Reviewed-by: Rich Salz <rsalz at openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6363)
-----------------------------------------------------------------------
Summary of changes:
crypto/chacha/asm/chacha-ppc.pl | 84 +++++++++++++++++------------------------
crypto/perlasm/ppc-xlate.pl | 43 ++++++++++++++-------
2 files changed, 65 insertions(+), 62 deletions(-)
diff --git a/crypto/chacha/asm/chacha-ppc.pl b/crypto/chacha/asm/chacha-ppc.pl
index 2a1036e..350d5fa 100755
--- a/crypto/chacha/asm/chacha-ppc.pl
+++ b/crypto/chacha/asm/chacha-ppc.pl
@@ -23,11 +23,14 @@
# IALU/gcc-4.x 3xAltiVec+1xIALU
#
# Freescale e300 13.6/+115% -
-# PPC74x0/G4e 6.81/+310% 4.66
-# PPC970/G5 9.29/+160% 4.60
-# POWER7 8.62/+61% 4.27
-# POWER8 8.70/+51% 3.96
-# POWER9 6.61/+29% 3.67
+# PPC74x0/G4e 6.81/+310% 3.72
+# PPC970/G5 9.29/+160% ?
+# POWER7 8.62/+61% 3.38
+# POWER8 8.70/+51% 3.36
+# POWER9 6.61/+29% 3.30(*)
+#
+# (*) this is a trade-off result; it's possible to improve it, but
+# doing so would negatively affect all the others;
$flavour = shift;
@@ -392,19 +395,19 @@ Loop_tail: # byte-by-byte loop
___
{{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2) =
- map("v$_",(0..14));
-my (@K)=map("v$_",(15..20));
-my ($FOUR,$sixteen,$twenty4,$twenty,$twelve,$twenty5,$seven) =
- map("v$_",(21..27));
-my ($inpperm,$outperm,$outmask) = map("v$_",(28..30));
-my @D=("v31",$seven,$T0,$T1,$T2);
+my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2)
+ = map("v$_",(0..11));
+my @K = map("v$_",(12..17));
+my ($FOUR,$sixteen,$twenty4) = map("v$_",(18..20));
+my ($inpperm,$outperm,$outmask) = map("v$_",(21..23));
+my @D = map("v$_",(24..28));
+my ($twelve,$seven,$T0,$T1) = @D;
-my $FRAME=$LOCALS+64+13*16+18*$SIZE_T; # 13*16 is for v20-v31 offload
+my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v20-v28 offload
sub VMXROUND {
my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
+my ($a,$b,$c,$d)=@_;
(
"&vadduwm ('$a','$a','$b')",
@@ -412,20 +415,16 @@ my ($a,$b,$c,$d,$t)=@_;
"&vperm ('$d','$d','$d','$sixteen')",
"&vadduwm ('$c','$c','$d')",
- "&vxor ('$t','$b','$c')",
- "&vsrw ('$b','$t','$twenty')",
- "&vslw ('$t','$t','$twelve')",
- "&vor ('$b','$b','$t')",
+ "&vxor ('$b','$b','$c')",
+ "&vrlw ('$b','$b','$twelve')",
"&vadduwm ('$a','$a','$b')",
"&vxor ('$d','$d','$a')",
"&vperm ('$d','$d','$d','$twenty4')",
"&vadduwm ('$c','$c','$d')",
- "&vxor ('$t','$b','$c')",
- "&vsrw ('$b','$t','$twenty5')",
- "&vslw ('$t','$t','$seven')",
- "&vor ('$b','$b','$t')",
+ "&vxor ('$b','$b','$c')",
+ "&vrlw ('$b','$b','$seven')",
"&vsldoi ('$c','$c','$c',8)",
"&vsldoi ('$b','$b','$b',$odd?4:12)",
@@ -461,13 +460,7 @@ $code.=<<___;
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
- addi r11,r11,32
stvx v28,r10,$sp
- addi r10,r10,32
- stvx v29,r11,$sp
- addi r11,r11,32
- stvx v30,r10,$sp
- stvx v31,r11,$sp
stw r12,`$FRAME-$SIZE_T*18-4`($sp) # save vrsave
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
@@ -487,9 +480,9 @@ $code.=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
- li r12,-1
+ li r12,-8
$PUSH r0, `$FRAME+$LRSAVE`($sp)
- mtspr 256,r12 # preserve all AltiVec registers
+ mtspr 256,r12 # preserve 29 AltiVec registers
bl Lconsts # returns pointer Lsigma in r12
li @x[0],16
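An aside on the li r12,-8 change: VRSAVE is a 32-bit mask whose
most-significant bit flags v0 and whose least-significant bit flags v31,
so -8 (0xfffffff8) marks v0-v28 as live, matching the smaller v20-v28
offload area above. A one-line Perl check (illustration only):

    my $vrsave = -8 & 0xffffffff;                    # li r12,-8
    printf "%032b -> %d registers\n", $vrsave,
           scalar grep { ($vrsave >> (31 - $_)) & 1 } 0 .. 31;
    # 11111111111111111111111111111000 -> 29 registers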
@@ -526,11 +519,6 @@ $code.=<<___;
lwz @d[3],12($ctr)
vadduwm @K[5], at K[4], at K[5]
- vspltisw $twenty,-12 # synthesize constants
- vspltisw $twelve,12
- vspltisw $twenty5,-7
- #vspltisw $seven,7 # synthesized in the loop
-
vxor $T0,$T0,$T0 # 0x00..00
vspltisw $outmask,-1 # 0xff..ff
?lvsr $inpperm,0,$inp # prepare for unaligned load
@@ -543,6 +531,7 @@ $code.=<<___;
be?vxor $outperm,$outperm,$T1
be?vperm $inpperm,$inpperm,$inpperm,$T0
+ li r0,10 # inner loop counter
b Loop_outer_vmx
.align 4
@@ -560,7 +549,6 @@ Loop_outer_vmx:
ori @x[3], at x[3],0x6574
vmr $B0, at K[1]
- li r0,10 # inner loop counter
lwz @x[4],0($key) # load key to GPR
vmr $B1, at K[1]
lwz @x[5],4($key)
@@ -586,15 +574,17 @@ Loop_outer_vmx:
mr @t[1], at x[5]
mr @t[2], at x[6]
mr @t[3], at x[7]
+
+ vspltisw $twelve,12 # synthesize constants
vspltisw $seven,7
mtctr r0
nop
Loop_vmx:
___
- my @thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,0);
+ my @thread0=&VMXROUND($A0,$B0,$C0,$D0,0);
+ my @thread1=&VMXROUND($A1,$B1,$C1,$D1,0);
+ my @thread2=&VMXROUND($A2,$B2,$C2,$D2,0);
my @thread3=&ROUND(0,4,8,12);
foreach (@thread0) {
@@ -602,10 +592,11 @@ ___
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
+ foreach (@thread3) { eval; }
- @thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,1);
+ @thread0=&VMXROUND($A0,$B0,$C0,$D0,1);
+ @thread1=&VMXROUND($A1,$B1,$C1,$D1,1);
+ @thread2=&VMXROUND($A2,$B2,$C2,$D2,1);
@thread3=&ROUND(0,5,10,15);
foreach (@thread0) {
@@ -613,6 +604,7 @@ ___
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
+ foreach (@thread3) { eval; }
$code.=<<___;
bdnz Loop_vmx
@@ -866,13 +858,7 @@ Ldone_vmx:
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
- addi r11,r11,32
lvx v28,r10,$sp
- addi r10,r10,32
- lvx v29,r11,$sp
- addi r11,r11,32
- lvx v30,r10,$sp
- lvx v31,r11,$sp
$POP r0, `$FRAME+$LRSAVE`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
@@ -904,7 +890,7 @@ Ldone_vmx:
Lconsts:
mflr r0
bcl 20,31,\$+4
- mflr r12 #vvvvv "distance between . and _vpaes_consts
+ mflr r12 #vvvvv "distance between . and Lsigma
addi r12,r12,`64-8`
mtlr r0
blr
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index ba2842f..1a22f7a 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -1,5 +1,5 @@
#! /usr/bin/env perl
-# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -217,6 +217,7 @@ my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx
my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x
my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x
my $lvx_splt = sub { vsxmem_op(@_, 332); }; # lxvdsx
+# VSX instruction[s] masqueraded as made-up AltiVec/VMX
my $vpermdi = sub { # xxpermdi
my ($f, $vrt, $vra, $vrb, $dm) = @_;
$dm = oct($dm) if ($dm =~ /^0/);
@@ -228,6 +229,10 @@ sub vcrypto_op {
my ($f, $vrt, $vra, $vrb, $op) = @_;
" .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
}
+sub vfour {
+ my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_;
+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
+};
my $vcipher = sub { vcrypto_op(@_, 1288); };
my $vcipherlast = sub { vcrypto_op(@_, 1289); };
my $vncipher = sub { vcrypto_op(@_, 1352); };
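The new vfour helper above factors out the VA-form encoding: primary opcode
4 in the top six bits, then the target and three source registers, and a
6-bit extended opcode at the bottom. A standalone Perl check of the word it
would emit (register numbers here are arbitrary):

    my ($vrt, $vra, $vrb, $vrc, $xo) = (1, 2, 3, 4, 35); # e.g. vmsumudm, XO 35
    my $word = (4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$xo;
    printf ".long 0x%X\n", $word;                        # .long 0x10221923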
@@ -239,7 +244,7 @@ my $vpmsumb = sub { vcrypto_op(@_, 1032); };
my $vpmsumd = sub { vcrypto_op(@_, 1224); };
my $vpmsubh = sub { vcrypto_op(@_, 1096); };
my $vpmsumw = sub { vcrypto_op(@_, 1160); };
-# These are not really crypto, but one can use vcrypto_op
+# These are not really crypto, but vcrypto_op template works
my $vaddudm = sub { vcrypto_op(@_, 192); };
my $vadduqm = sub { vcrypto_op(@_, 256); };
my $vmuleuw = sub { vcrypto_op(@_, 648); };
@@ -247,21 +252,29 @@ my $vmulouw = sub { vcrypto_op(@_, 136); };
my $vrld = sub { vcrypto_op(@_, 196); };
my $vsld = sub { vcrypto_op(@_, 1476); };
my $vsrd = sub { vcrypto_op(@_, 1732); };
+my $vsubudm = sub { vcrypto_op(@_, 1216); };
+my $vaddcuq = sub { vcrypto_op(@_, 320); };
+my $vaddeuqm = sub { vfour(@_,60); };
+my $vaddecuq = sub { vfour(@_,61); };
my $mtsle = sub {
my ($f, $arg) = @_;
" .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
};
-# PowerISA 3.0 stuff
-my $maddhdu = sub {
- my ($f, $rt, $ra, $rb, $rc) = @_;
- " .long ".sprintf "0x%X",(4<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($rc<<6)|49;
+# VSX instructions masqueraded as AltiVec/VMX
+my $mtvrd = sub {
+ my ($f, $vrt, $ra) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|(179<<1)|1;
};
-my $maddld = sub {
- my ($f, $rt, $ra, $rb, $rc) = @_;
- " .long ".sprintf "0x%X",(4<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($rc<<6)|51;
+my $mtvrwz = sub {
+ my ($f, $vrt, $ra) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|(243<<1)|1;
};
+
+# PowerISA 3.0 stuff
+my $maddhdu = sub { vfour(@_,49); };
+my $maddld = sub { vfour(@_,51); };
my $darn = sub {
my ($f, $rt, $l) = @_;
" .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($l<<16)|(755<<1);
@@ -270,16 +283,20 @@ my $iseleq = sub {
my ($f, $rt, $ra, $rb) = @_;
" .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|(2<<6)|30;
};
+# VSX instruction[s] masqueraded as made-up AltiVec/VMX
+my $vspltib = sub { # xxspltib
+ my ($f, $vrt, $imm8) = @_;
+ $imm8 = oct($imm8) if ($imm8 =~ /^0/);
+ $imm8 &= 0xff;
+ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($imm8<<11)|(360<<1)|1;
+};
# PowerISA 3.0B stuff
my $addex = sub {
my ($f, $rt, $ra, $rb, $cy) = @_; # only cy==0 is specified in 3.0B
" .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1);
};
-my $vmsumudm = sub {
- my ($f, $vrt, $vra, $vrb, $vrc) = @_;
- " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|35;
-};
+my $vmsumudm = sub { vfour(@_,35); };
while($line=<>) {
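Similarly, the masqueraded VSX forms ($mtvrd, $mtvrwz, $vspltib) set the low
opcode bit so the target lands in the VSR half that aliases the AltiVec
registers. A standalone Perl check of the xxspltib word (operands here are
arbitrary):

    my ($vrt, $imm8) = (0, 7);                       # e.g. vspltib v0,7
    my $word = (60<<26)|($vrt<<21)|(($imm8 & 0xff)<<11)|(360<<1)|1;
    printf ".long 0x%X\n", $word;                    # .long 0xF0003AD1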