[openssl-commits] [openssl] master update

Andy Polyakov appro at openssl.org
Wed Jun 6 20:14:48 UTC 2018


The branch master has been updated
       via  316d527ff9b6597105df399fc222ea328cd827bf (commit)
       via  791cc3029bd2aee7fc6b766b9841ce1e0a14484a (commit)
       via  3f9c3b3c48252f24043689ad6b0e78b4a3005910 (commit)
       via  f0c77d66b49c3ca7378035f233f86ec0336866eb (commit)
      from  0336df2fa316a3e08b8f0d2d0e8d4bc175e46634 (commit)


- Log -----------------------------------------------------------------
commit 316d527ff9b6597105df399fc222ea328cd827bf
Author: Andy Polyakov <appro at openssl.org>
Date:   Tue Jun 5 20:00:46 2018 +0200

    crypto/ppccap.c: wire new ChaCha20_ctr32_vsx.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6419)

commit 791cc3029bd2aee7fc6b766b9841ce1e0a14484a
Author: Andy Polyakov <appro at openssl.org>
Date:   Tue Jun 5 19:59:19 2018 +0200

    chacha/asm/chacha-ppc.pl: improve performance by 40/80% on POWER8/9.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6419)

commit 3f9c3b3c48252f24043689ad6b0e78b4a3005910
Author: Andy Polyakov <appro at openssl.org>
Date:   Tue Jun 5 19:57:42 2018 +0200

    perlasm/ppc-xlate.pl: add vmrg[eo]w instructions.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6419)

commit f0c77d66b49c3ca7378035f233f86ec0336866eb
Author: Andy Polyakov <appro at openssl.org>
Date:   Tue Jun 5 19:55:55 2018 +0200

    sha/asm/sha512p8-ppc.pl: fix build on Mac OS X.
    
    Reviewed-by: Rich Salz <rsalz at openssl.org>
    (Merged from https://github.com/openssl/openssl/pull/6419)

-----------------------------------------------------------------------

Summary of changes:
 crypto/chacha/asm/chacha-ppc.pl | 415 +++++++++++++++++++++++++++++++++++++++-
 crypto/perlasm/ppc-xlate.pl     |   2 +
 crypto/ppccap.c                 |  11 +-
 crypto/sha/asm/sha512p8-ppc.pl  |   3 +-
 4 files changed, 418 insertions(+), 13 deletions(-)
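
For context on the diff below: the new 4xVSX path evaluates the standard
ChaCha20 quarter-round (RFC 7539) in four vector lanes at once, one
64-byte block per lane. As a point of reference only (quarter_round is an
illustrative helper, not the committed code), a minimal scalar sketch of
that quarter-round looks like this; the rotate amounts 16/12/8/7 are the
same constants synthesized with vspltisw and applied with vrlw in the
VSX code:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha20 quarter-round over the 16-word state (RFC 7539); the
 * 4xVSX path runs four independent blocks through the same
 * add/xor/rotate sequence using vadduwm/vxor/vrlw. */
static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d],  8);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b],  7);
}

In the Loop_vsx body, VSX_lane_ROUND(0,4,8,12) emits the four column
quarter-rounds, (0,4,8,12) through (3,7,11,15), and VSX_lane_ROUND(0,5,10,15)
the four diagonal ones; the index arithmetic inside the helper derives
the remaining three quarter-rounds of each set.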

diff --git a/crypto/chacha/asm/chacha-ppc.pl b/crypto/chacha/asm/chacha-ppc.pl
index 88746fe..96cdfeb 100755
--- a/crypto/chacha/asm/chacha-ppc.pl
+++ b/crypto/chacha/asm/chacha-ppc.pl
@@ -18,19 +18,31 @@
 #
 # ChaCha20 for PowerPC/AltiVec.
 #
+# June 2018
+#
+# Add a VSX 2.07 code path. The original 3xAltiVec+1xIALU approach is
+# well-suited for processors that can't issue more than one vector
+# instruction per cycle. But POWER8 (and POWER9) can issue a pair, so a
+# vector-only 4x interleave performs better there. Incidentally, PowerISA
+# 2.07 (first implemented by POWER8) defined new usable instructions,
+# hence the 4xVSX code path...
+#
 # Performance in cycles per byte out of large buffer.
 #
-#			IALU/gcc-4.x    3xAltiVec+1xIALU
+#			IALU/gcc-4.x    3xAltiVec+1xIALU	4xVSX
 #
-# Freescale e300	13.6/+115%	-
-# PPC74x0/G4e		6.81/+310%	3.81
-# PPC970/G5		9.29/+160%	?
-# POWER7		8.62/+61%	3.35
-# POWER8		8.70/+51%	2.91
-# POWER9		8.80/+29%	4.44(*)
+# Freescale e300	13.6/+115%	-			-
+# PPC74x0/G4e		6.81/+310%	3.81			-
+# PPC970/G5		9.29/+160%	?			-
+# POWER7		8.62/+61%	3.35			-
+# POWER8		8.70/+51%	2.91			2.09
+# POWER9		8.80/+29%	4.44(*)			2.45(**)
 #
 # (*)	this is a trade-off result; it is possible to improve it, but
 #	then it would negatively affect all the others;
+# (**)	POWER9 seems to be "allergic" to mixing vector and integer
+#	instructions, which is why the switch to vector-only code pays
+#	off that much;
 
 $flavour = shift;
 
@@ -893,7 +905,390 @@ Ldone_vmx:
 	.byte	0,12,0x04,1,0x80,18,5,0
 	.long	0
 .size	.ChaCha20_ctr32_vmx,.-.ChaCha20_ctr32_vmx
+___
+}}}
+{{{
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
+my @K = map("v$_",(16..19));
+my $CTR = "v26";
+my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
+my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
+my $beperm = "v31";
+
+my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
+
+my $FRAME=$LOCALS+64+7*16;	# 7*16 is for v26-v31 offload
+
+sub VSX_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("\"v$_\"",(0..15));
+
+	(
+	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
+	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
+	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
+	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
+	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
+	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
+	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
+	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
+	"&vrlw		(@x[$d0],@x[$d0],'$sixteen')",
+	 "&vrlw		(@x[$d1],@x[$d1],'$sixteen')",
+	  "&vrlw	(@x[$d2],@x[$d2],'$sixteen')",
+	   "&vrlw	(@x[$d3],@x[$d3],'$sixteen')",
+
+	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
+	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
+	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
+	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
+	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
+	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
+	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
+	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
+	"&vrlw		(@x[$b0],@x[$b0],'$twelve')",
+	 "&vrlw		(@x[$b1],@x[$b1],'$twelve')",
+	  "&vrlw	(@x[$b2],@x[$b2],'$twelve')",
+	   "&vrlw	(@x[$b3],@x[$b3],'$twelve')",
+
+	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",
+	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",
+	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",
+	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",
+	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
+	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
+	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
+	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
+	"&vrlw		(@x[$d0],@x[$d0],'$eight')",
+	 "&vrlw		(@x[$d1],@x[$d1],'$eight')",
+	  "&vrlw	(@x[$d2],@x[$d2],'$eight')",
+	   "&vrlw	(@x[$d3],@x[$d3],'$eight')",
+
+	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
+	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
+	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
+	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
+	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
+	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
+	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
+	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
+	"&vrlw		(@x[$b0],@x[$b0],'$seven')",
+	 "&vrlw		(@x[$b1],@x[$b1],'$seven')",
+	  "&vrlw	(@x[$b2],@x[$b2],'$seven')",
+	   "&vrlw	(@x[$b3],@x[$b3],'$seven')"
+	);
+}
+
+$code.=<<___;
 
+.globl	.ChaCha20_ctr32_vsx
+.align	5
+.ChaCha20_ctr32_vsx:
+	$STU	$sp,-$FRAME($sp)
+	mflr	r0
+	li	r10,`15+$LOCALS+64`
+	li	r11,`31+$LOCALS+64`
+	mfspr	r12,256
+	stvx	v26,r10,$sp
+	addi	r10,r10,32
+	stvx	v27,r11,$sp
+	addi	r11,r11,32
+	stvx	v28,r10,$sp
+	addi	r10,r10,32
+	stvx	v29,r11,$sp
+	addi	r11,r11,32
+	stvx	v30,r10,$sp
+	stvx	v31,r11,$sp
+	stw	r12,`$FRAME-4`($sp)		# save vrsave
+	li	r12,-4096+63
+	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
+	mtspr	256,r12				# preserve 26 AltiVec registers (v0-v19,v26-v31)
+
+	bl	Lconsts				# returns pointer to Lsigma in r12
+	lvx_4w	@K[0],0,r12			# load sigma
+	addi	r12,r12,0x50
+	li	$x10,16
+	li	$x20,32
+	li	$x30,48
+	li	r11,64
+
+	lvx_4w	@K[1],0,$key			# load key
+	lvx_4w	@K[2],$x10,$key
+	lvx_4w	@K[3],0,$ctr			# load counter
+
+	vxor	$xt0,$xt0,$xt0
+	lvx_4w	$xt1,r11,r12
+	vspltw	$CTR,@K[3],0
+	vsldoi	@K[3],@K[3],$xt0,4
+	vsldoi	@K[3],$xt0,@K[3],12		# clear @K[3].word[0]
+	vadduwm	$CTR,$CTR,$xt1
+
+	be?lvsl	$beperm,0,$x10			# 0x00..0f
+	be?vspltisb $xt0,3			# 0x03..03
+	be?vxor	$beperm,$beperm,$xt0		# swap bytes within words
+
+	li	r0,10				# inner loop counter
+	mtctr	r0
+	b	Loop_outer_vsx
+
+.align	5
+Loop_outer_vsx:
+	lvx	$xa0,$x00,r12			# load [smashed] sigma
+	lvx	$xa1,$x10,r12
+	lvx	$xa2,$x20,r12
+	lvx	$xa3,$x30,r12
+
+	vspltw	$xb0,@K[1],0			# smash the key
+	vspltw	$xb1,@K[1],1
+	vspltw	$xb2,@K[1],2
+	vspltw	$xb3,@K[1],3
+
+	vspltw	$xc0,@K[2],0
+	vspltw	$xc1,@K[2],1
+	vspltw	$xc2,@K[2],2
+	vspltw	$xc3,@K[2],3
+
+	vmr	$xd0,$CTR			# smash the counter
+	vspltw	$xd1,@K[3],1
+	vspltw	$xd2,@K[3],2
+	vspltw	$xd3,@K[3],3
+
+	vspltisw $sixteen,-16			# synthesize constants
+	vspltisw $twelve,12
+	vspltisw $eight,8
+	vspltisw $seven,7
+
+Loop_vsx:
+___
+	foreach (&VSX_lane_ROUND(0, 4, 8,12)) { eval; }
+	foreach (&VSX_lane_ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+	bdnz	Loop_vsx
+
+	vadduwm	$xd0,$xd0,$CTR
+
+	vmrgew	$xt0,$xa0,$xa1			# transpose data
+	vmrgew	$xt1,$xa2,$xa3
+	vmrgow	$xa0,$xa0,$xa1
+	vmrgow	$xa2,$xa2,$xa3
+	 vmrgew	$xt2,$xb0,$xb1
+	 vmrgew	$xt3,$xb2,$xb3
+	vpermdi	$xa1,$xa0,$xa2,0b00
+	vpermdi	$xa3,$xa0,$xa2,0b11
+	vpermdi	$xa0,$xt0,$xt1,0b00
+	vpermdi	$xa2,$xt0,$xt1,0b11
+
+	vmrgow	$xb0,$xb0,$xb1
+	vmrgow	$xb2,$xb2,$xb3
+	 vmrgew	$xt0,$xc0,$xc1
+	 vmrgew	$xt1,$xc2,$xc3
+	vpermdi	$xb1,$xb0,$xb2,0b00
+	vpermdi	$xb3,$xb0,$xb2,0b11
+	vpermdi	$xb0,$xt2,$xt3,0b00
+	vpermdi	$xb2,$xt2,$xt3,0b11
+
+	vmrgow	$xc0,$xc0,$xc1
+	vmrgow	$xc2,$xc2,$xc3
+	 vmrgew	$xt2,$xd0,$xd1
+	 vmrgew	$xt3,$xd2,$xd3
+	vpermdi	$xc1,$xc0,$xc2,0b00
+	vpermdi	$xc3,$xc0,$xc2,0b11
+	vpermdi	$xc0,$xt0,$xt1,0b00
+	vpermdi	$xc2,$xt0,$xt1,0b11
+
+	vmrgow	$xd0,$xd0,$xd1
+	vmrgow	$xd2,$xd2,$xd3
+	 vspltisw $xt0,4
+	 vadduwm  $CTR,$CTR,$xt0		# next counter value
+	vpermdi	$xd1,$xd0,$xd2,0b00
+	vpermdi	$xd3,$xd0,$xd2,0b11
+	vpermdi	$xd0,$xt2,$xt3,0b00
+	vpermdi	$xd2,$xt2,$xt3,0b11
+
+	vadduwm	$xa0,$xa0,@K[0]
+	vadduwm	$xb0,$xb0,@K[1]
+	vadduwm	$xc0,$xc0,@K[2]
+	vadduwm	$xd0,$xd0,@K[3]
+
+	be?vperm $xa0,$xa0,$xa0,$beperm
+	be?vperm $xb0,$xb0,$xb0,$beperm
+	be?vperm $xc0,$xc0,$xc0,$beperm
+	be?vperm $xd0,$xd0,$xd0,$beperm
+
+	${UCMP}i $len,0x40
+	blt	Ltail_vsx
+
+	lvx_4w	$xt0,$x00,$inp
+	lvx_4w	$xt1,$x10,$inp
+	lvx_4w	$xt2,$x20,$inp
+	lvx_4w	$xt3,$x30,$inp
+
+	vxor	$xt0,$xt0,$xa0
+	vxor	$xt1,$xt1,$xb0
+	vxor	$xt2,$xt2,$xc0
+	vxor	$xt3,$xt3,$xd0
+
+	stvx_4w	$xt0,$x00,$out
+	stvx_4w	$xt1,$x10,$out
+	addi	$inp,$inp,0x40
+	stvx_4w	$xt2,$x20,$out
+	subi	$len,$len,0x40
+	stvx_4w	$xt3,$x30,$out
+	addi	$out,$out,0x40
+	beq	Ldone_vsx
+
+	vadduwm	$xa0,$xa1,@K[0]
+	vadduwm	$xb0,$xb1,@K[1]
+	vadduwm	$xc0,$xc1,@K[2]
+	vadduwm	$xd0,$xd1,@K[3]
+
+	be?vperm $xa0,$xa0,$xa0,$beperm
+	be?vperm $xb0,$xb0,$xb0,$beperm
+	be?vperm $xc0,$xc0,$xc0,$beperm
+	be?vperm $xd0,$xd0,$xd0,$beperm
+
+	${UCMP}i $len,0x40
+	blt	Ltail_vsx
+
+	lvx_4w	$xt0,$x00,$inp
+	lvx_4w	$xt1,$x10,$inp
+	lvx_4w	$xt2,$x20,$inp
+	lvx_4w	$xt3,$x30,$inp
+
+	vxor	$xt0,$xt0,$xa0
+	vxor	$xt1,$xt1,$xb0
+	vxor	$xt2,$xt2,$xc0
+	vxor	$xt3,$xt3,$xd0
+
+	stvx_4w	$xt0,$x00,$out
+	stvx_4w	$xt1,$x10,$out
+	addi	$inp,$inp,0x40
+	stvx_4w	$xt2,$x20,$out
+	subi	$len,$len,0x40
+	stvx_4w	$xt3,$x30,$out
+	addi	$out,$out,0x40
+	beq	Ldone_vsx
+
+	vadduwm	$xa0,$xa2,@K[0]
+	vadduwm	$xb0,$xb2,@K[1]
+	vadduwm	$xc0,$xc2,@K[2]
+	vadduwm	$xd0,$xd2,@K[3]
+
+	be?vperm $xa0,$xa0,$xa0,$beperm
+	be?vperm $xb0,$xb0,$xb0,$beperm
+	be?vperm $xc0,$xc0,$xc0,$beperm
+	be?vperm $xd0,$xd0,$xd0,$beperm
+
+	${UCMP}i $len,0x40
+	blt	Ltail_vsx
+
+	lvx_4w	$xt0,$x00,$inp
+	lvx_4w	$xt1,$x10,$inp
+	lvx_4w	$xt2,$x20,$inp
+	lvx_4w	$xt3,$x30,$inp
+
+	vxor	$xt0,$xt0,$xa0
+	vxor	$xt1,$xt1,$xb0
+	vxor	$xt2,$xt2,$xc0
+	vxor	$xt3,$xt3,$xd0
+
+	stvx_4w	$xt0,$x00,$out
+	stvx_4w	$xt1,$x10,$out
+	addi	$inp,$inp,0x40
+	stvx_4w	$xt2,$x20,$out
+	subi	$len,$len,0x40
+	stvx_4w	$xt3,$x30,$out
+	addi	$out,$out,0x40
+	beq	Ldone_vsx
+
+	vadduwm	$xa0,$xa3,@K[0]
+	vadduwm	$xb0,$xb3,@K[1]
+	vadduwm	$xc0,$xc3,@K[2]
+	vadduwm	$xd0,$xd3,@K[3]
+
+	be?vperm $xa0,$xa0,$xa0,$beperm
+	be?vperm $xb0,$xb0,$xb0,$beperm
+	be?vperm $xc0,$xc0,$xc0,$beperm
+	be?vperm $xd0,$xd0,$xd0,$beperm
+
+	${UCMP}i $len,0x40
+	blt	Ltail_vsx
+
+	lvx_4w	$xt0,$x00,$inp
+	lvx_4w	$xt1,$x10,$inp
+	lvx_4w	$xt2,$x20,$inp
+	lvx_4w	$xt3,$x30,$inp
+
+	vxor	$xt0,$xt0,$xa0
+	vxor	$xt1,$xt1,$xb0
+	vxor	$xt2,$xt2,$xc0
+	vxor	$xt3,$xt3,$xd0
+
+	stvx_4w	$xt0,$x00,$out
+	stvx_4w	$xt1,$x10,$out
+	addi	$inp,$inp,0x40
+	stvx_4w	$xt2,$x20,$out
+	subi	$len,$len,0x40
+	stvx_4w	$xt3,$x30,$out
+	addi	$out,$out,0x40
+	mtctr	r0
+	bne	Loop_outer_vsx
+
+Ldone_vsx:
+	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
+	li	r10,`15+$LOCALS+64`
+	li	r11,`31+$LOCALS+64`
+	$POP	r0, `$FRAME+$LRSAVE`($sp)
+	mtspr	256,r12				# restore vrsave
+	lvx	v26,r10,$sp
+	addi	r10,r10,32
+	lvx	v27,r11,$sp
+	addi	r11,r11,32
+	lvx	v28,r10,$sp
+	addi	r10,r10,32
+	lvx	v29,r11,$sp
+	addi	r11,r11,32
+	lvx	v30,r10,$sp
+	lvx	v31,r11,$sp
+	mtlr	r0
+	addi	$sp,$sp,$FRAME
+	blr
+
+.align	4
+Ltail_vsx:
+	addi	r11,$sp,$LOCALS
+	mtctr	$len
+	stvx_4w	$xa0,$x00,r11			# offload block to stack
+	stvx_4w	$xb0,$x10,r11
+	stvx_4w	$xc0,$x20,r11
+	stvx_4w	$xd0,$x30,r11
+	subi	r12,r11,1			# prepare for *++ptr
+	subi	$inp,$inp,1
+	subi	$out,$out,1
+
+Loop_tail_vsx:
+	lbzu	r6,1(r12)
+	lbzu	r7,1($inp)
+	xor	r6,r6,r7
+	stbu	r6,1($out)
+	bdnz	Loop_tail_vsx
+
+	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
+	stvx_4w	$K[0],$x10,r11
+	stvx_4w	$K[0],$x20,r11
+	stvx_4w	$K[0],$x30,r11
+
+	b	Ldone_vsx
+	.long	0
+	.byte	0,12,0x04,1,0x80,0,5,0
+	.long	0
+.size	.ChaCha20_ctr32_vsx,.-.ChaCha20_ctr32_vsx
+___
+}}}
+$code.=<<___;
 .align	5
 Lconsts:
 	mflr	r0
@@ -919,10 +1314,14 @@ $code.=<<___ 	if (!$LITTLE_ENDIAN);	# flipped words
 	.long	0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
 ___
 $code.=<<___;
+	.long	0x61707865,0x61707865,0x61707865,0x61707865
+	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
+	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
+	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
+	.long	0,1,2,3
 .asciz  "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
 ___
-}}}
 
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/ge;
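
The Ltail_vsx path above handles a final partial block (len < 64) by
spilling one keystream block to the stack, XORing it into the data byte
by byte, and then storing key material over the stack copy to wipe it.
A rough C equivalent of that strategy, as a sketch only (chacha_tail_xor
and the use of OPENSSL_cleanse here are illustrative, not the committed
interface):

#include <string.h>
#include <openssl/crypto.h>    /* OPENSSL_cleanse */

/* Sketch of the Ltail_vsx idea: keystream for the last block is
 * parked in a stack buffer, consumed bytewise, then wiped. */
static void chacha_tail_xor(unsigned char *out, const unsigned char *inp,
                            size_t len, const unsigned char keystream[64])
{
    unsigned char buf[64];
    size_t i;

    memcpy(buf, keystream, sizeof(buf));  /* "offload block to stack" */
    for (i = 0; i < len; i++)             /* the lbzu/xor/stbu loop */
        out[i] = inp[i] ^ buf[i];
    OPENSSL_cleanse(buf, sizeof(buf));    /* "wipe copy of the block" */
}
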
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index 1a22f7a..d220c62 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -256,6 +256,8 @@ my $vsubudm	= sub { vcrypto_op(@_, 1216); };
 my $vaddcuq	= sub { vcrypto_op(@_, 320);  };
 my $vaddeuqm	= sub { vfour(@_,60); };
 my $vaddecuq	= sub { vfour(@_,61); };
+my $vmrgew	= sub { vfour(@_,0,1932); };
+my $vmrgow	= sub { vfour(@_,0,1676); };
 
 my $mtsle	= sub {
     my ($f, $arg) = @_;
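
The two added lines let perlasm emit the PowerISA 2.07 even/odd word
merges as raw opcodes on assemblers that don't know them. The values
1932 and 1676 are the VX-form extended opcodes of vmrgew and vmrgow;
assuming vfour() packs major opcode 4 with VRT/VRA/VRB and the trailing
value into the low-order bits (vx_word below is an illustrative sketch,
not the perlasm helper), the emitted words can be reproduced like so:

#include <stdint.h>
#include <stdio.h>

/* VX-form word: major opcode 4 | VRT | VRA | VRB | 11-bit XO.
 * XO 1932 = vmrgew, XO 1676 = vmrgow (PowerISA 2.07). */
static uint32_t vx_word(unsigned xo, unsigned vrt, unsigned vra, unsigned vrb)
{
    return (4u << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | xo;
}

int main(void)
{
    printf(".long\t0x%08x\t# vmrgew v0,v1,v2\n", vx_word(1932, 0, 1, 2));
    printf(".long\t0x%08x\t# vmrgow v0,v1,v2\n", vx_word(1676, 0, 1, 2));
    return 0;
}
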
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
index f8b7c00..8b7d765 100644
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -90,13 +90,18 @@ void ChaCha20_ctr32_int(unsigned char *out, const unsigned char *inp,
 void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
                         size_t len, const unsigned int key[8],
                         const unsigned int counter[4]);
+void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
+                        size_t len, const unsigned int key[8],
+                        const unsigned int counter[4]);
 void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                     size_t len, const unsigned int key[8],
                     const unsigned int counter[4])
 {
-    OPENSSL_ppccap_P & PPC_ALTIVEC
-        ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
-        : ChaCha20_ctr32_int(out, inp, len, key, counter);
+    OPENSSL_ppccap_P & PPC_CRYPTO207
+        ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
+        : OPENSSL_ppccap_P & PPC_ALTIVEC
+            ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
+            : ChaCha20_ctr32_int(out, inp, len, key, counter);
 }
 #endif
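
Rendered as an if/else chain, the new selector in ChaCha20_ctr32 reads
as follows (an equivalent sketch for readability, not the committed
nested conditional); the VSX path is tried first since PPC_CRYPTO207
(ISA 2.07, POWER8 and later) presupposes AltiVec:

/* Equivalent rendering of the dispatcher above; relies on the
 * declarations shown in this diff. */
void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                    size_t len, const unsigned int key[8],
                    const unsigned int counter[4])
{
    if (OPENSSL_ppccap_P & PPC_CRYPTO207)        /* POWER8+: 4xVSX path */
        ChaCha20_ctr32_vsx(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_ALTIVEC)     /* AltiVec/VMX path */
        ChaCha20_ctr32_vmx(out, inp, len, key, counter);
    else                                         /* generic integer code */
        ChaCha20_ctr32_int(out, inp, len, key, counter);
}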
 
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index e3f522c..2792800 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -92,8 +92,7 @@ $idx="r7";
 $lrsave="r8";
 $offload="r11";
 $vrsave="r12";
-@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
-      $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));
 
 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
 @X=map("v$_",(8..19,24..27));

