[openssl-commits] [openssl] master update

Sun Jun 3 19:20:54 UTC 2018

The branch master has been updated
       via  c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03 (commit)
       via  1a467bd12f20928f3d5e6809b5f9394dbe606541 (commit)
       via  41013cd63c068e2f271fabc92702ee67d800f0cb (commit)
      from  9a708bf982da1d2c9739339d16d7b021da955e00 (commit)


- Log -----------------------------------------------------------------
commit c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03
Author: Andy Polyakov <appro at openssl.org>
Date:   Sat Jun 2 15:25:50 2018 +0200

    sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.
    
    Biggest part, ~7%, of improvement resulted from omitting constants'
    table index increment in each round. And minor part from rescheduling
    instructions. Apparently POWER9 (and POWER8) manage to dispatch
    instructions more efficiently if they are laid down as if they have
    no latency...
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6406)

commit 1a467bd12f20928f3d5e6809b5f9394dbe606541
Author: Andy Polyakov <appro at openssl.org>
Date:   Sat Jun 2 14:14:28 2018 +0200

    chacha/asm/chacha-ppc.pl: improve POWER8 performance by 15%.
    
    This comes at cost of minor 2.5% regression on G4, which is reasonable
    trade-off. [Further improve compliance with ABI requirements.]
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6406)

commit 41013cd63c068e2f271fabc92702ee67d800f0cb
Author: Andy Polyakov <appro at openssl.org>
Date:   Sat Jun 2 14:03:27 2018 +0200

    PPC assembly pack: correct POWER9 results.
    
    As it turns out originally published results were skewed by "turbo"
    mode. VM apparently remains oblivious to dynamic frequency scaling,
    and reports that processor operates at "base" frequency at all times.
    While actual frequency gets increased under load.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6406)

-----------------------------------------------------------------------

Summary of changes:
 crypto/aes/asm/aesp8-ppc.pl           |   3 +-
 crypto/chacha/asm/chacha-ppc.pl       |  74 ++++++++++++---------
 crypto/modes/asm/ghashp8-ppc.pl       |   2 +-
 crypto/poly1305/asm/poly1305-ppc.pl   |   2 +-
 crypto/poly1305/asm/poly1305-ppcfp.pl |   1 -
 crypto/poly1305/poly1305_ieee754.c    |   1 -
 crypto/sha/asm/keccak1600-ppc64.pl    |   2 +-
 crypto/sha/asm/keccak1600p8-ppc.pl    |   2 +-
 crypto/sha/asm/sha512p8-ppc.pl        | 122 +++++++++++++++-------------------
 9 files changed, 102 insertions(+), 107 deletions(-)

diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
index 8670940..488b133 100755
--- a/crypto/aes/asm/aesp8-ppc.pl
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -40,7 +40,8 @@
 #		CBC en-/decrypt	CTR	XTS
 # POWER8[le]	3.96/0.72	0.74	1.1
 # POWER8[be]	3.75/0.65	0.66	1.0
-# POWER9[le]	3.05/0.65	0.65	0.80
+# POWER9[le]	4.02/0.86	0.84	1.05
+# POWER9[be]	3.99/0.78	0.79	0.97
 
 $flavour = shift;
 
diff --git a/crypto/chacha/asm/chacha-ppc.pl b/crypto/chacha/asm/chacha-ppc.pl
index 350d5fa..88746fe 100755
--- a/crypto/chacha/asm/chacha-ppc.pl
+++ b/crypto/chacha/asm/chacha-ppc.pl
@@ -23,11 +23,11 @@
 #			IALU/gcc-4.x    3xAltiVec+1xIALU
 #
 # Freescale e300	13.6/+115%	-
-# PPC74x0/G4e		6.81/+310%	3.72
+# PPC74x0/G4e		6.81/+310%	3.81
 # PPC970/G5		9.29/+160%	?
-# POWER7		8.62/+61%	3.38
-# POWER8		8.70/+51%	3.36
-# POWER9		6.61/+29%	3.30(*)
+# POWER7		8.62/+61%	3.35
+# POWER8		8.70/+51%	2.91
+# POWER9		8.80/+29%	4.44(*)
 #
 # (*)	this is trade-off result, it's possible to improve it, but
 #	then it would negatively affect all others;
@@ -398,12 +398,12 @@ ___
 my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2)
 				= map("v$_",(0..11));
 my @K				= map("v$_",(12..17));
-my ($FOUR,$sixteen,$twenty4)	= map("v$_",(18..20));
-my ($inpperm,$outperm,$outmask)	= map("v$_",(21..23));
-my @D				= map("v$_",(24..28));
+my ($FOUR,$sixteen,$twenty4)	= map("v$_",(18..19,23));
+my ($inpperm,$outperm,$outmask)	= map("v$_",(24..26));
+my @D				= map("v$_",(27..31));
 my ($twelve,$seven,$T0,$T1) = @D;
 
-my $FRAME=$LOCALS+64+10*16+18*$SIZE_T;	# 10*16 is for v20-v28 offload
+my $FRAME=$LOCALS+64+10*16+18*$SIZE_T;	# 10*16 is for v23-v31 offload
 
 sub VMXROUND {
 my $odd = pop;
@@ -445,22 +445,22 @@ $code.=<<___;
 	li	r10,`15+$LOCALS+64`
 	li	r11,`31+$LOCALS+64`
 	mfspr	r12,256
-	stvx	v20,r10,$sp
+	stvx	v23,r10,$sp
 	addi	r10,r10,32
-	stvx	v21,r11,$sp
+	stvx	v24,r11,$sp
 	addi	r11,r11,32
-	stvx	v22,r10,$sp
+	stvx	v25,r10,$sp
 	addi	r10,r10,32
-	stvx	v23,r11,$sp
+	stvx	v26,r11,$sp
 	addi	r11,r11,32
-	stvx	v24,r10,$sp
+	stvx	v27,r10,$sp
 	addi	r10,r10,32
-	stvx	v25,r11,$sp
+	stvx	v28,r11,$sp
 	addi	r11,r11,32
-	stvx	v26,r10,$sp
+	stvx	v29,r10,$sp
 	addi	r10,r10,32
-	stvx	v27,r11,$sp
-	stvx	v28,r10,$sp
+	stvx	v30,r11,$sp
+	stvx	v31,r10,$sp
 	stw	r12,`$FRAME-$SIZE_T*18-4`($sp)	# save vrsave
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
@@ -480,7 +480,7 @@ $code.=<<___;
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
-	li	r12,-8
+	li	r12,-4096+511
 	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
 	mtspr	256,r12				# preserve 29 AltiVec registers
 
@@ -588,9 +588,13 @@ ___
 	my @thread3=&ROUND(0,4,8,12);
 
 	foreach (@thread0) {
-		eval;			eval(shift(@thread3));
-		eval(shift(@thread1));	eval(shift(@thread3));
-		eval(shift(@thread2));	eval(shift(@thread3));
+		eval;
+		eval(shift(@thread1));
+		eval(shift(@thread2));
+
+		eval(shift(@thread3));
+		eval(shift(@thread3));
+		eval(shift(@thread3));
 	}
 	foreach (@thread3) { eval; }
 
@@ -600,9 +604,13 @@ ___
 	@thread3=&ROUND(0,5,10,15);
 
 	foreach (@thread0) {
-		eval;			eval(shift(@thread3));
-		eval(shift(@thread1));	eval(shift(@thread3));
-		eval(shift(@thread2));	eval(shift(@thread3));
+		eval;
+		eval(shift(@thread1));
+		eval(shift(@thread2));
+
+		eval(shift(@thread3));
+		eval(shift(@thread3));
+		eval(shift(@thread3));
 	}
 	foreach (@thread3) { eval; }
 $code.=<<___;
@@ -843,22 +851,22 @@ Ldone_vmx:
 	li	r10,`15+$LOCALS+64`
 	li	r11,`31+$LOCALS+64`
 	mtspr	256,r12				# restore vrsave
-	lvx	v20,r10,$sp
+	lvx	v23,r10,$sp
 	addi	r10,r10,32
-	lvx	v21,r11,$sp
+	lvx	v24,r11,$sp
 	addi	r11,r11,32
-	lvx	v22,r10,$sp
+	lvx	v25,r10,$sp
 	addi	r10,r10,32
-	lvx	v23,r11,$sp
+	lvx	v26,r11,$sp
 	addi	r11,r11,32
-	lvx	v24,r10,$sp
+	lvx	v27,r10,$sp
 	addi	r10,r10,32
-	lvx	v25,r11,$sp
+	lvx	v28,r11,$sp
 	addi	r11,r11,32
-	lvx	v26,r10,$sp
+	lvx	v29,r10,$sp
 	addi	r10,r10,32
-	lvx	v27,r11,$sp
-	lvx	v28,r10,$sp
+	lvx	v30,r11,$sp
+	lvx	v31,r10,$sp
 	$POP	r0, `$FRAME+$LRSAVE`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
index 6df485e..6a2ac71 100755
--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -30,7 +30,7 @@
 # 2x aggregated reduction improves performance by 50% (resulting
 # performance on POWER8 is 1 cycle per processed byte), and 4x
 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
-# POWER9 delivers 0.40 cpb.
+# POWER9 delivers 0.51 cpb.
 
 $flavour=shift;
 $output =shift;
diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl
index cb4ae23..0c6d015 100755
--- a/crypto/poly1305/asm/poly1305-ppc.pl
+++ b/crypto/poly1305/asm/poly1305-ppc.pl
@@ -28,7 +28,7 @@
 # PPC970		7.00/+114%	3.51/+205%
 # POWER7		3.75/+260%	1.93/+100%
 # POWER8		-		2.03/+200%
-# POWER9		-		1.56/+150%
+# POWER9		-		2.00/+150%
 #
 # Do we need floating-point implementation for PPC? Results presented
 # in poly1305_ieee754.c are tricky to compare to, because they are for
diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl
index 2abb8e2..09f8185 100755
--- a/crypto/poly1305/asm/poly1305-ppcfp.pl
+++ b/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -26,7 +26,6 @@
 # PPC970		6.03/+80%
 # POWER7		3.50/+30%
 # POWER8		3.75/+10%
-# POWER9		2.80/+12%
 
 $flavour = shift;
 
diff --git a/crypto/poly1305/poly1305_ieee754.c b/crypto/poly1305/poly1305_ieee754.c
index 1a06e03..995a02e 100644
--- a/crypto/poly1305/poly1305_ieee754.c
+++ b/crypto/poly1305/poly1305_ieee754.c
@@ -38,7 +38,6 @@
  * POWER6               4.92
  * POWER7               4.50
  * POWER8               4.10
- * POWER9               3.14
  *
  * z10                  11.2
  * z196+                7.30
diff --git a/crypto/sha/asm/keccak1600-ppc64.pl b/crypto/sha/asm/keccak1600-ppc64.pl
index bc1023e..30e70c5 100755
--- a/crypto/sha/asm/keccak1600-ppc64.pl
+++ b/crypto/sha/asm/keccak1600-ppc64.pl
@@ -30,7 +30,7 @@
 # PPC970/G5	14.6/+120%
 # POWER7	10.3/+100%
 # POWER8	11.5/+85%
-# POWER9	7.2/+45%
+# POWER9	9.4/+45%
 #
 # (*)	Corresponds to SHA3-256. Percentage after slash is improvement
 #	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
diff --git a/crypto/sha/asm/keccak1600p8-ppc.pl b/crypto/sha/asm/keccak1600p8-ppc.pl
index a0aeeb0..de2bcd6 100755
--- a/crypto/sha/asm/keccak1600p8-ppc.pl
+++ b/crypto/sha/asm/keccak1600p8-ppc.pl
@@ -23,7 +23,7 @@
 # buffer for r=1088, which matches SHA3-256. This is 17% better than
 # scalar PPC64 code. It probably should be noted that if POWER8's
 # successor can achieve higher scalar instruction issue rate, then
-# this module will loose... And it does on POWER9 with 8.8 vs. 7.2.
+# this module will loose... And it does on POWER9 with 12.0 vs. 9.4.
 
 $flavour = shift;
 
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index a33ae4d..e3f522c 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -37,8 +37,8 @@
 # build of sha512-ppc.pl, presented for reference.
 #
 #		POWER8		POWER9
-# SHA256	9.9 [15.8]	9.2 [9.3]
-# SHA512	6.3 [10.3]	5.8 [5.9]
+# SHA256	9.7 [15.8]	11.2 [12.5]
+# SHA512	6.1 [10.3]	7.0 [7.9]
 
 $flavour=shift;
 $output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
 }
 
 $func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
 $lrsave="r8";
 $offload="r11";
 $vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+      $x00=0 if ($flavour =~ /osx/);
 
 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
 
 sub ROUND {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 my $j=($i+1)%16;
+my $k=($i+2)%8;
 
 $code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
 	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
@@ -112,26 +114,30 @@ ___
 $code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
 	vperm		@X[$i],@X[$i],@X[$i],$lemask
 ___
+$code.=<<___		if ($i>=15);
+	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
+	vaddu${sz}m	@X[$j],@X[$j],$Sigma
+	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
+	vaddu${sz}m	@X[$j],@X[$j],$Sigma
+	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
+___
 $code.=<<___;
-	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
-	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
-	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
 	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
-	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
-	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
+	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
+	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
 	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
+	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
+	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
 	vxor		$Func,$a,$b
-	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
-	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
 	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
-	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
 	vaddu${sz}m	$d,$d,$h		; d+=h
-	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
-	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
-	lvx		$Ki,$idx,$Tbl		; load next K[i]
-	addi		$idx,$idx,16
-	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
-	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
+	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
+	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
+	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
+	lvx		$Ki,@I[$k],$idx		; load next K[i]
+___
+$code.=<<___		if ($k == 7);
+	addi		$idx,$idx,0x80
 ___
 }
 
@@ -142,21 +148,13 @@ $code=<<___;
 .globl	$func
 .align	6
 $func:
-	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	$STU		$sp,-$FRAME($sp)
 	mflr		$lrsave
-	li		r10,`$FRAME+8*16+15`
-	li		r11,`$FRAME+8*16+31`
-	stvx		v20,r10,$sp		# ABI says so
+	li		r10,`$LOCALS+15`
+	li		r11,`$LOCALS+31`
+	stvx		v24,r10,$sp		# ABI says so
 	addi		r10,r10,32
 	mfspr		$vrsave,256
-	stvx		v21,r11,$sp
-	addi		r11,r11,32
-	stvx		v22,r10,$sp
-	addi		r10,r10,32
-	stvx		v23,r11,$sp
-	addi		r11,r11,32
-	stvx		v24,r10,$sp
-	addi		r10,r10,32
 	stvx		v25,r11,$sp
 	addi		r11,r11,32
 	stvx		v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
 	addi		r11,r11,32
 	stvx		v30,r10,$sp
 	stvx		v31,r11,$sp
-	li		r11,-1
-	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		r11,-4096+255
+	stw		$vrsave,`$FRAME+6*$SIZE_T-4`($sp)	# save vrsave
 	li		$x10,0x10
-	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
 	li		$x20,0x20
-	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
 	li		$x30,0x30
-	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
 	li		$x40,0x40
-	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
 	li		$x50,0x50
-	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
 	li		$x60,0x60
-	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
 	li		$x70,0x70
-	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
 	mtspr		256,r11
 
 	bl		LPICmeup
-	addi		$offload,$sp,$FRAME+15
+	addi		$offload,$sp,`8*$SIZE_T+15`
 ___
 $code.=<<___		if ($LENDIAN);
 	li		$idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
 .align	5
 Loop:
 	lvx		$Ki,$x00,$Tbl
-	li		$idx,16
 	lvx_u		@X[0],0,$inp
 	addi		$inp,$inp,16
+	mr		$idx,$Tbl		# copy $Tbl
 	stvx		$A,$x00,$offload	# offload $A-$H
 	stvx		$B,$x10,$offload
 	stvx		$C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
 	stvx		$G,$x60,$offload
 	stvx		$H,$x70,$offload
 	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
-	lvx		$Ki,$idx,$Tbl
-	addi		$idx,$idx,16
+	lvx		$Ki,$x10,$Tbl
 ___
 for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
 	bne		Loop
 ___
 $code.=<<___		if ($SZ==4);
-	lvx		@X[0],$idx,$Tbl
-	addi		$idx,$idx,16
+	lvx		@X[0],$x20,$idx
 	vperm		$A,$A,$B,$Ki		# pack the answer
-	lvx		@X[1],$idx,$Tbl
+	lvx		@X[1],$x30,$idx
 	vperm		$E,$E,$F,$Ki
 	vperm		$A,$A,$C,@X[0]
 	vperm		$E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___		if ($SZ==8);
 	stvx_u		$G,$x30,$ctx
 ___
 $code.=<<___;
-	li		r10,`$FRAME+8*16+15`
+	li		r10,`$LOCALS+15`
 	mtlr		$lrsave
-	li		r11,`$FRAME+8*16+31`
+	li		r11,`$LOCALS+31`
 	mtspr		256,$vrsave
-	lvx		v20,r10,$sp		# ABI says so
-	addi		r10,r10,32
-	lvx		v21,r11,$sp
-	addi		r11,r11,32
-	lvx		v22,r10,$sp
-	addi		r10,r10,32
-	lvx		v23,r11,$sp
-	addi		r11,r11,32
-	lvx		v24,r10,$sp
+	lvx		v24,r10,$sp		# ABI says so
 	addi		r10,r10,32
 	lvx		v25,r11,$sp
 	addi		r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
 	addi		r11,r11,32
 	lvx		v30,r10,$sp
 	lvx		v31,r11,$sp
-	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
+	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
+	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
+	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
+	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
+	addi		$sp,$sp,$FRAME
 	blr
 	.long		0
 	.byte		0,12,4,1,0x80,6,3,0