
Annotation of /rpms/openssl3/contribs10/0072-ChaCha20-performance-optimizations-for-ppc64le.patch



Revision 1.1
Wed Jan 31 17:24:44 2024 UTC (10 months ago) by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 jpp 1.1 Upstream-Status: Backport [
2     https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149,
3     https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa,
4     hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447
5     ]
6     diff --git a/crypto/chacha/asm/chachap10-ppc.pl b/crypto/chacha/asm/chachap10-ppc.pl
7     new file mode 100755
8     index 0000000..36e9a8d
9     --- /dev/null
10     +++ b/crypto/chacha/asm/chachap10-ppc.pl
11     @@ -0,0 +1,1288 @@
12     +#! /usr/bin/env perl
13     +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
14     +#
15     +# Licensed under the Apache License 2.0 (the "License"). You may not use
16     +# this file except in compliance with the License. You can obtain a copy
17     +# in the file LICENSE in the source distribution or at
18     +# https://www.openssl.org/source/license.html
19     +
20     +#
21     +# ====================================================================
22     +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
23     +# project. The module is, however, dual licensed under OpenSSL and
24     +# CRYPTOGAMS licenses depending on where you obtain it. For further
25     +# details see http://www.openssl.org/~appro/cryptogams/.
26     +# ====================================================================
27     +#
28     +# October 2015
29     +#
30     +# ChaCha20 for PowerPC/AltiVec.
31     +#
32     +# June 2018
33     +#
34     +# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
35     +# processors that can't issue more than one vector instruction per
36     +# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
37     +# interleave would perform better. Incidentally PowerISA 2.07 (first
38     +# implemented by POWER8) defined new usable instructions, hence 4xVSX
39     +# code path...
40     +#
41     +# Performance in cycles per byte out of large buffer.
42     +#
43     +#                      IALU/gcc-4.x    3xAltiVec+1xIALU    4xVSX
44     +#
45     +# Freescale e300       13.6/+115%      -                   -
46     +# PPC74x0/G4e          6.81/+310%      3.81                -
47     +# PPC970/G5            9.29/+160%      ?                   -
48     +# POWER7               8.62/+61%       3.35                -
49     +# POWER8               8.70/+51%       2.91                2.09
50     +# POWER9               8.80/+29%       4.44(*)             2.45(**)
51     +#
52     +# (*) this is a trade-off result; it is possible to improve it, but
53     +# then it would negatively affect all others;
54     +# (**) POWER9 seems to be "allergic" to mixing vector and integer
55     +# instructions, which is why the switch to vector-only code pays
56     +# off that much;
57     +
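
For reference, the 4xVSX path below runs four ChaCha20 blocks side by side, one
block per vector lane: VSX_lane_ROUND_4x is applied once to the column indices
(0,4,8,12) and once to the diagonal indices (0,5,10,15), with the rotate counts
held in the $sixteen/$twelve/$eight/$seven splat constants. A minimal scalar
sketch of that double round in plain C (illustrative only, not part of the
patch):

    /* One ChaCha20 double round: four column quarter-rounds followed by
     * four diagonal quarter-rounds, using the 16/12/8/7 rotations. */
    #include <stdint.h>
    #include <stdio.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    #define QR(a, b, c, d)                          \
        do {                                        \
            a += b; d ^= a; d = ROTL32(d, 16);      \
            c += d; b ^= c; b = ROTL32(b, 12);      \
            a += b; d ^= a; d = ROTL32(d, 8);       \
            c += d; b ^= c; b = ROTL32(b, 7);       \
        } while (0)

    static void chacha_double_round(uint32_t x[16])
    {
        QR(x[0], x[4], x[8],  x[12]);   /* columns */
        QR(x[1], x[5], x[9],  x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        QR(x[0], x[5], x[10], x[15]);   /* diagonals */
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[8],  x[13]);
        QR(x[3], x[4], x[9],  x[14]);
    }

    int main(void)
    {
        uint32_t x[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };

        chacha_double_round(x);         /* ten of these make 20 rounds */
        printf("%08x\n", x[0]);
        return 0;
    }

The vector code performs the same adds, xors and rotates with vadduwm, vxor
and vrlw, on four 32-bit lanes at once.
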
58     +# $output is the last argument if it looks like a file (it has an extension)
59     +# $flavour is the first argument if it doesn't look like a file
60     +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
61     +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
62     +
63     +if ($flavour =~ /64/) {
64     + $SIZE_T =8;
65     + $LRSAVE =2*$SIZE_T;
66     + $STU ="stdu";
67     + $POP ="ld";
68     + $PUSH ="std";
69     + $UCMP ="cmpld";
70     +} elsif ($flavour =~ /32/) {
71     + $SIZE_T =4;
72     + $LRSAVE =$SIZE_T;
73     + $STU ="stwu";
74     + $POP ="lwz";
75     + $PUSH ="stw";
76     + $UCMP ="cmplw";
77     +} else { die "nonsense $flavour"; }
78     +
79     +$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
80     +
81     +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
82     +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83     +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
84     +die "can't locate ppc-xlate.pl";
85     +
86     +open STDOUT,"| $^X $xlate $flavour \"$output\""
87     + or die "can't call $xlate: $!";
88     +
89     +$LOCALS=6*$SIZE_T;
90     +$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables
91     +
92     +sub AUTOLOAD() # thunk [simplified] x86-style perlasm
93     +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
94     + $code .= "\t$opcode\t".join(',',@_)."\n";
95     +}
96     +
97     +my $sp = "r1";
98     +
99     +my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
100     +
101     +
102     +{{{
103     +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
104     + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
105     +my @K = map("v$_",(16..19));
106     +my $CTR = "v26";
107     +my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
108     +my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
109     +my $beperm = "v31";
110     +
111     +my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
112     +
113     +my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload
114     +
115     +
116     +sub VSX_lane_ROUND_4x {
117     +my ($a0,$b0,$c0,$d0)=@_;
118     +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
119     +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
120     +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
121     +my @x=map("\"v$_\"",(0..15));
122     +
123     + (
124     + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
125     + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
126     + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
127     + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
128     + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
129     + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
130     + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
131     + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
132     + "&vrlw (@x[$d0],@x[$d0],'$sixteen')",
133     + "&vrlw (@x[$d1],@x[$d1],'$sixteen')",
134     + "&vrlw (@x[$d2],@x[$d2],'$sixteen')",
135     + "&vrlw (@x[$d3],@x[$d3],'$sixteen')",
136     +
137     + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
138     + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
139     + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
140     + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
141     + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
142     + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
143     + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
144     + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
145     + "&vrlw (@x[$b0],@x[$b0],'$twelve')",
146     + "&vrlw (@x[$b1],@x[$b1],'$twelve')",
147     + "&vrlw (@x[$b2],@x[$b2],'$twelve')",
148     + "&vrlw (@x[$b3],@x[$b3],'$twelve')",
149     +
150     + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
151     + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
152     + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
153     + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
154     + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
155     + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
156     + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
157     + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
158     + "&vrlw (@x[$d0],@x[$d0],'$eight')",
159     + "&vrlw (@x[$d1],@x[$d1],'$eight')",
160     + "&vrlw (@x[$d2],@x[$d2],'$eight')",
161     + "&vrlw (@x[$d3],@x[$d3],'$eight')",
162     +
163     + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
164     + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
165     + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
166     + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
167     + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
168     + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
169     + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
170     + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
171     + "&vrlw (@x[$b0],@x[$b0],'$seven')",
172     + "&vrlw (@x[$b1],@x[$b1],'$seven')",
173     + "&vrlw (@x[$b2],@x[$b2],'$seven')",
174     + "&vrlw (@x[$b3],@x[$b3],'$seven')"
175     + );
176     +}
177     +
178     +$code.=<<___;
179     +
180     +.globl .ChaCha20_ctr32_vsx_p10
181     +.align 5
182     +.ChaCha20_ctr32_vsx_p10:
183     + ${UCMP}i $len,255
184     + bgt ChaCha20_ctr32_vsx_8x
185     + $STU $sp,-$FRAME($sp)
186     + mflr r0
187     + li r10,`15+$LOCALS+64`
188     + li r11,`31+$LOCALS+64`
189     + mfspr r12,256
190     + stvx v26,r10,$sp
191     + addi r10,r10,32
192     + stvx v27,r11,$sp
193     + addi r11,r11,32
194     + stvx v28,r10,$sp
195     + addi r10,r10,32
196     + stvx v29,r11,$sp
197     + addi r11,r11,32
198     + stvx v30,r10,$sp
199     + stvx v31,r11,$sp
200     + stw r12,`$FRAME-4`($sp) # save vrsave
201     + li r12,-4096+63
202     + $PUSH r0, `$FRAME+$LRSAVE`($sp)
203     + mtspr 256,r12 # preserve 29 AltiVec registers
204     +
205     + bl Lconsts # returns pointer Lsigma in r12
206     + lvx_4w @K[0],0,r12 # load sigma
207     + addi r12,r12,0x70
208     + li $x10,16
209     + li $x20,32
210     + li $x30,48
211     + li r11,64
212     +
213     + lvx_4w @K[1],0,$key # load key
214     + lvx_4w @K[2],$x10,$key
215     + lvx_4w @K[3],0,$ctr # load counter
216     +
217     + vxor $xt0,$xt0,$xt0
218     + lvx_4w $xt1,r11,r12
219     + vspltw $CTR,@K[3],0
220     + vsldoi @K[3],@K[3],$xt0,4
221     + vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0]
222     + vadduwm $CTR,$CTR,$xt1
223     +
224     + be?lvsl $beperm,0,$x10 # 0x00..0f
225     + be?vspltisb $xt0,3 # 0x03..03
226     + be?vxor $beperm,$beperm,$xt0 # swap bytes within words
227     +
228     + li r0,10 # inner loop counter
229     + mtctr r0
230     + b Loop_outer_vsx
231     +
232     +.align 5
233     +Loop_outer_vsx:
234     + lvx $xa0,$x00,r12 # load [smashed] sigma
235     + lvx $xa1,$x10,r12
236     + lvx $xa2,$x20,r12
237     + lvx $xa3,$x30,r12
238     +
239     + vspltw $xb0,@K[1],0 # smash the key
240     + vspltw $xb1,@K[1],1
241     + vspltw $xb2,@K[1],2
242     + vspltw $xb3,@K[1],3
243     +
244     + vspltw $xc0,@K[2],0
245     + vspltw $xc1,@K[2],1
246     + vspltw $xc2,@K[2],2
247     + vspltw $xc3,@K[2],3
248     +
249     + vmr $xd0,$CTR # smash the counter
250     + vspltw $xd1,@K[3],1
251     + vspltw $xd2,@K[3],2
252     + vspltw $xd3,@K[3],3
253     +
254     + vspltisw $sixteen,-16 # synthesize constants
255     + vspltisw $twelve,12
256     + vspltisw $eight,8
257     + vspltisw $seven,7
258     +
259     +Loop_vsx_4x:
260     +___
261     + foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }
262     + foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }
263     +$code.=<<___;
264     +
265     + bdnz Loop_vsx_4x
266     +
267     + vadduwm $xd0,$xd0,$CTR
268     +
269     + vmrgew $xt0,$xa0,$xa1 # transpose data
270     + vmrgew $xt1,$xa2,$xa3
271     + vmrgow $xa0,$xa0,$xa1
272     + vmrgow $xa2,$xa2,$xa3
273     + vmrgew $xt2,$xb0,$xb1
274     + vmrgew $xt3,$xb2,$xb3
275     + vpermdi $xa1,$xa0,$xa2,0b00
276     + vpermdi $xa3,$xa0,$xa2,0b11
277     + vpermdi $xa0,$xt0,$xt1,0b00
278     + vpermdi $xa2,$xt0,$xt1,0b11
279     +
280     + vmrgow $xb0,$xb0,$xb1
281     + vmrgow $xb2,$xb2,$xb3
282     + vmrgew $xt0,$xc0,$xc1
283     + vmrgew $xt1,$xc2,$xc3
284     + vpermdi $xb1,$xb0,$xb2,0b00
285     + vpermdi $xb3,$xb0,$xb2,0b11
286     + vpermdi $xb0,$xt2,$xt3,0b00
287     + vpermdi $xb2,$xt2,$xt3,0b11
288     +
289     + vmrgow $xc0,$xc0,$xc1
290     + vmrgow $xc2,$xc2,$xc3
291     + vmrgew $xt2,$xd0,$xd1
292     + vmrgew $xt3,$xd2,$xd3
293     + vpermdi $xc1,$xc0,$xc2,0b00
294     + vpermdi $xc3,$xc0,$xc2,0b11
295     + vpermdi $xc0,$xt0,$xt1,0b00
296     + vpermdi $xc2,$xt0,$xt1,0b11
297     +
298     + vmrgow $xd0,$xd0,$xd1
299     + vmrgow $xd2,$xd2,$xd3
300     + vspltisw $xt0,4
301     + vadduwm $CTR,$CTR,$xt0 # next counter value
302     + vpermdi $xd1,$xd0,$xd2,0b00
303     + vpermdi $xd3,$xd0,$xd2,0b11
304     + vpermdi $xd0,$xt2,$xt3,0b00
305     + vpermdi $xd2,$xt2,$xt3,0b11
306     +
307     + vadduwm $xa0,$xa0,@K[0]
308     + vadduwm $xb0,$xb0,@K[1]
309     + vadduwm $xc0,$xc0,@K[2]
310     + vadduwm $xd0,$xd0,@K[3]
311     +
312     + be?vperm $xa0,$xa0,$xa0,$beperm
313     + be?vperm $xb0,$xb0,$xb0,$beperm
314     + be?vperm $xc0,$xc0,$xc0,$beperm
315     + be?vperm $xd0,$xd0,$xd0,$beperm
316     +
317     + ${UCMP}i $len,0x40
318     + blt Ltail_vsx
319     +
320     + lvx_4w $xt0,$x00,$inp
321     + lvx_4w $xt1,$x10,$inp
322     + lvx_4w $xt2,$x20,$inp
323     + lvx_4w $xt3,$x30,$inp
324     +
325     + vxor $xt0,$xt0,$xa0
326     + vxor $xt1,$xt1,$xb0
327     + vxor $xt2,$xt2,$xc0
328     + vxor $xt3,$xt3,$xd0
329     +
330     + stvx_4w $xt0,$x00,$out
331     + stvx_4w $xt1,$x10,$out
332     + addi $inp,$inp,0x40
333     + stvx_4w $xt2,$x20,$out
334     + subi $len,$len,0x40
335     + stvx_4w $xt3,$x30,$out
336     + addi $out,$out,0x40
337     + beq Ldone_vsx
338     +
339     + vadduwm $xa0,$xa1,@K[0]
340     + vadduwm $xb0,$xb1,@K[1]
341     + vadduwm $xc0,$xc1,@K[2]
342     + vadduwm $xd0,$xd1,@K[3]
343     +
344     + be?vperm $xa0,$xa0,$xa0,$beperm
345     + be?vperm $xb0,$xb0,$xb0,$beperm
346     + be?vperm $xc0,$xc0,$xc0,$beperm
347     + be?vperm $xd0,$xd0,$xd0,$beperm
348     +
349     + ${UCMP}i $len,0x40
350     + blt Ltail_vsx
351     +
352     + lvx_4w $xt0,$x00,$inp
353     + lvx_4w $xt1,$x10,$inp
354     + lvx_4w $xt2,$x20,$inp
355     + lvx_4w $xt3,$x30,$inp
356     +
357     + vxor $xt0,$xt0,$xa0
358     + vxor $xt1,$xt1,$xb0
359     + vxor $xt2,$xt2,$xc0
360     + vxor $xt3,$xt3,$xd0
361     +
362     + stvx_4w $xt0,$x00,$out
363     + stvx_4w $xt1,$x10,$out
364     + addi $inp,$inp,0x40
365     + stvx_4w $xt2,$x20,$out
366     + subi $len,$len,0x40
367     + stvx_4w $xt3,$x30,$out
368     + addi $out,$out,0x40
369     + beq Ldone_vsx
370     +
371     + vadduwm $xa0,$xa2,@K[0]
372     + vadduwm $xb0,$xb2,@K[1]
373     + vadduwm $xc0,$xc2,@K[2]
374     + vadduwm $xd0,$xd2,@K[3]
375     +
376     + be?vperm $xa0,$xa0,$xa0,$beperm
377     + be?vperm $xb0,$xb0,$xb0,$beperm
378     + be?vperm $xc0,$xc0,$xc0,$beperm
379     + be?vperm $xd0,$xd0,$xd0,$beperm
380     +
381     + ${UCMP}i $len,0x40
382     + blt Ltail_vsx
383     +
384     + lvx_4w $xt0,$x00,$inp
385     + lvx_4w $xt1,$x10,$inp
386     + lvx_4w $xt2,$x20,$inp
387     + lvx_4w $xt3,$x30,$inp
388     +
389     + vxor $xt0,$xt0,$xa0
390     + vxor $xt1,$xt1,$xb0
391     + vxor $xt2,$xt2,$xc0
392     + vxor $xt3,$xt3,$xd0
393     +
394     + stvx_4w $xt0,$x00,$out
395     + stvx_4w $xt1,$x10,$out
396     + addi $inp,$inp,0x40
397     + stvx_4w $xt2,$x20,$out
398     + subi $len,$len,0x40
399     + stvx_4w $xt3,$x30,$out
400     + addi $out,$out,0x40
401     + beq Ldone_vsx
402     +
403     + vadduwm $xa0,$xa3,@K[0]
404     + vadduwm $xb0,$xb3,@K[1]
405     + vadduwm $xc0,$xc3,@K[2]
406     + vadduwm $xd0,$xd3,@K[3]
407     +
408     + be?vperm $xa0,$xa0,$xa0,$beperm
409     + be?vperm $xb0,$xb0,$xb0,$beperm
410     + be?vperm $xc0,$xc0,$xc0,$beperm
411     + be?vperm $xd0,$xd0,$xd0,$beperm
412     +
413     + ${UCMP}i $len,0x40
414     + blt Ltail_vsx
415     +
416     + lvx_4w $xt0,$x00,$inp
417     + lvx_4w $xt1,$x10,$inp
418     + lvx_4w $xt2,$x20,$inp
419     + lvx_4w $xt3,$x30,$inp
420     +
421     + vxor $xt0,$xt0,$xa0
422     + vxor $xt1,$xt1,$xb0
423     + vxor $xt2,$xt2,$xc0
424     + vxor $xt3,$xt3,$xd0
425     +
426     + stvx_4w $xt0,$x00,$out
427     + stvx_4w $xt1,$x10,$out
428     + addi $inp,$inp,0x40
429     + stvx_4w $xt2,$x20,$out
430     + subi $len,$len,0x40
431     + stvx_4w $xt3,$x30,$out
432     + addi $out,$out,0x40
433     + mtctr r0
434     + bne Loop_outer_vsx
435     +
436     +Ldone_vsx:
437     + lwz r12,`$FRAME-4`($sp) # pull vrsave
438     + li r10,`15+$LOCALS+64`
439     + li r11,`31+$LOCALS+64`
440     + $POP r0, `$FRAME+$LRSAVE`($sp)
441     + mtspr 256,r12 # restore vrsave
442     + lvx v26,r10,$sp
443     + addi r10,r10,32
444     + lvx v27,r11,$sp
445     + addi r11,r11,32
446     + lvx v28,r10,$sp
447     + addi r10,r10,32
448     + lvx v29,r11,$sp
449     + addi r11,r11,32
450     + lvx v30,r10,$sp
451     + lvx v31,r11,$sp
452     + mtlr r0
453     + addi $sp,$sp,$FRAME
454     + blr
455     +
456     +.align 4
457     +Ltail_vsx:
458     + addi r11,$sp,$LOCALS
459     + mtctr $len
460     + stvx_4w $xa0,$x00,r11 # offload block to stack
461     + stvx_4w $xb0,$x10,r11
462     + stvx_4w $xc0,$x20,r11
463     + stvx_4w $xd0,$x30,r11
464     + subi r12,r11,1 # prepare for *++ptr
465     + subi $inp,$inp,1
466     + subi $out,$out,1
467     +
468     +Loop_tail_vsx:
469     + lbzu r6,1(r12)
470     + lbzu r7,1($inp)
471     + xor r6,r6,r7
472     + stbu r6,1($out)
473     + bdnz Loop_tail_vsx
474     +
475     + stvx_4w $K[0],$x00,r11 # wipe copy of the block
476     + stvx_4w $K[0],$x10,r11
477     + stvx_4w $K[0],$x20,r11
478     + stvx_4w $K[0],$x30,r11
479     +
480     + b Ldone_vsx
481     + .long 0
482     + .byte 0,12,0x04,1,0x80,0,5,0
483     + .long 0
484     +.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
485     +___
486     +}}}
487     +
488     +## This is the 8-blocks-in-parallel implementation. The heart of the chacha round uses
489     +# vector instructions that can only access vsr[32+X]; holding the state of 8 parallel
490     +# blocks takes all 32 vector registers, so a few register values are parked on the side
491     +# in VSR[0]-VSR[31], keeping VSR[32+X] free for the round-op vector instructions and
492     +# their intermediate values.
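
To make the layout above concrete: vector register i holds word i of four
consecutive blocks, so 32 vectors cover blocks 0-3 and blocks 4-7, and every
lane of a given vector is identical except in the counter word. A plain-C
sketch of that "smashed" state (illustrative only, not part of the patch; the
struct and function names are invented):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint32_t lane[4]; } vec4x32;  /* stands in for one VR */

    /* state[] is sigma(4) + key(8) + counter(1) + nonce(3); word 12 is the
     * 32-bit block counter and is the only word that differs per lane. */
    static void smash_state(vec4x32 v[32], const uint32_t state[16],
                            uint32_t counter)
    {
        for (int word = 0; word < 16; word++) {
            for (int blk = 0; blk < 8; blk++) {
                int reg = (blk < 4) ? word : 16 + word; /* blocks 0-3 vs 4-7 */
                uint32_t w = (word == 12) ? counter + (uint32_t)blk
                                          : state[word];
                v[reg].lane[blk & 3] = w;
            }
        }
    }

    int main(void)
    {
        uint32_t state[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
        vec4x32 v[32];

        smash_state(v, state, 1);
        /* counter lanes: blocks 0-3 then blocks 4-7 */
        printf("%u %u %u %u | %u %u %u %u\n",
               v[12].lane[0], v[12].lane[1], v[12].lane[2], v[12].lane[3],
               v[28].lane[0], v[28].lane[1], v[28].lane[2], v[28].lane[3]);
        return 0;
    }

In the assembly this splatting is done with vspltw, and the per-lane counters
come from $CTR0 and $CTR1 = $CTR0 + 4.
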
493     +{{{
494     +#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
495     +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
496     + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
497     + $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
498     + $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));
499     +my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));
500     +my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));
501     +my @K = map("v$_",27,(24..26));
502     +my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));
503     +my $xr0 = "v4";
504     +my $CTR0 = "v22";
505     +my $CTR1 = "v5";
506     +my $beperm = "v31";
507     +my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
508     +my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));
509     +my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
510     +my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
511     +my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));
512     +
513     +my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload
514     +
515     +sub VSX_lane_ROUND_8x {
516     +my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;
517     +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
518     +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
519     +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
520     +my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
521     +my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
522     +my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
523     +my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));
524     +my @x=map("\"v$_\"",(0..31));
525     +
526     + (
527     + "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13
528     + "&vxxlorc (@x[$c7], $xv9,$xv9)",
529     +
530     + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
531     + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
532     + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
533     + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
534     + "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1
535     + "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2
536     + "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3
537     + "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4
538     +
539     + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
540     + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
541     + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
542     + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
543     + "&vxor (@x[$d4],@x[$d4],@x[$a4])",
544     + "&vxor (@x[$d5],@x[$d5],@x[$a5])",
545     + "&vxor (@x[$d6],@x[$d6],@x[$a6])",
546     + "&vxor (@x[$d7],@x[$d7],@x[$a7])",
547     +
548     + "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
549     + "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
550     + "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
551     + "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
552     + "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
553     + "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
554     + "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
555     + "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
556     +
557     + "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
558     + "&vxxlorc (@x[$c7], $xv15,$xv15)",
559     + "&vxxlorc (@x[$a7], $xv10,$xv10)",
560     +
561     + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
562     + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
563     + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
564     + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
565     + "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
566     + "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
567     + "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
568     + "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
569     +
570     + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
571     + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
572     + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
573     + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
574     + "&vxor (@x[$b4],@x[$b4],@x[$c4])",
575     + "&vxor (@x[$b5],@x[$b5],@x[$c5])",
576     + "&vxor (@x[$b6],@x[$b6],@x[$c6])",
577     + "&vxor (@x[$b7],@x[$b7],@x[$c7])",
578     +
579     + "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
580     + "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
581     + "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
582     + "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
583     + "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
584     + "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
585     + "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
586     + "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
587     +
588     + "&vxxlorc (@x[$a7], $xv13,$xv13)",
589     + "&vxxlor ($xv15 ,@x[$c7],@x[$c7])",
590     + "&vxxlorc (@x[$c7], $xv11,$xv11)",
591     +
592     +
593     + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
594     + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
595     + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
596     + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
597     + "&vadduwm (@x[$a4],@x[$a4],@x[$b4])",
598     + "&vadduwm (@x[$a5],@x[$a5],@x[$b5])",
599     + "&vadduwm (@x[$a6],@x[$a6],@x[$b6])",
600     + "&vadduwm (@x[$a7],@x[$a7],@x[$b7])",
601     +
602     + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
603     + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
604     + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
605     + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
606     + "&vxor (@x[$d4],@x[$d4],@x[$a4])",
607     + "&vxor (@x[$d5],@x[$d5],@x[$a5])",
608     + "&vxor (@x[$d6],@x[$d6],@x[$a6])",
609     + "&vxor (@x[$d7],@x[$d7],@x[$a7])",
610     +
611     + "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
612     + "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
613     + "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
614     + "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
615     + "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
616     + "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
617     + "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
618     + "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
619     +
620     + "&vxxlorc (@x[$c7], $xv15,$xv15)",
621     + "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
622     + "&vxxlorc (@x[$a7], $xv12,$xv12)",
623     +
624     + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
625     + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
626     + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
627     + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
628     + "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
629     + "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
630     + "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
631     + "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
632     + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
633     + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
634     + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
635     + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
636     + "&vxor (@x[$b4],@x[$b4],@x[$c4])",
637     + "&vxor (@x[$b5],@x[$b5],@x[$c5])",
638     + "&vxor (@x[$b6],@x[$b6],@x[$c6])",
639     + "&vxor (@x[$b7],@x[$b7],@x[$c7])",
640     + "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
641     + "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
642     + "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
643     + "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
644     + "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
645     + "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
646     + "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
647     + "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
648     +
649     + "&vxxlorc (@x[$a7], $xv13,$xv13)",
650     + );
651     +}
652     +
653     +$code.=<<___;
654     +
655     +.globl .ChaCha20_ctr32_vsx_8x
656     +.align 5
657     +.ChaCha20_ctr32_vsx_8x:
658     + $STU $sp,-$FRAME($sp)
659     + mflr r0
660     + li r10,`15+$LOCALS+64`
661     + li r11,`31+$LOCALS+64`
662     + mfspr r12,256
663     + stvx v24,r10,$sp
664     + addi r10,r10,32
665     + stvx v25,r11,$sp
666     + addi r11,r11,32
667     + stvx v26,r10,$sp
668     + addi r10,r10,32
669     + stvx v27,r11,$sp
670     + addi r11,r11,32
671     + stvx v28,r10,$sp
672     + addi r10,r10,32
673     + stvx v29,r11,$sp
674     + addi r11,r11,32
675     + stvx v30,r10,$sp
676     + stvx v31,r11,$sp
677     + stw r12,`$FRAME-4`($sp) # save vrsave
678     + li r12,-4096+63
679     + $PUSH r0, `$FRAME+$LRSAVE`($sp)
680     + mtspr 256,r12 # preserve 29 AltiVec registers
681     +
682     + bl Lconsts # returns pointer Lsigma in r12
683     +
684     + lvx_4w @K[0],0,r12 # load sigma
685     + addi r12,r12,0x70
686     + li $x10,16
687     + li $x20,32
688     + li $x30,48
689     + li r11,64
690     +
691     + vspltisw $xa4,-16 # synthesize constants
692     + vspltisw $xb4,12 # synthesize constants
693     + vspltisw $xc4,8 # synthesize constants
694     + vspltisw $xd4,7 # synthesize constants
695     +
696     + lvx $xa0,$x00,r12 # load [smashed] sigma
697     + lvx $xa1,$x10,r12
698     + lvx $xa2,$x20,r12
699     + lvx $xa3,$x30,r12
700     +
701     + vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12
702     + vxxlor $xv10 ,$xb4,$xb4
703     + vxxlor $xv11 ,$xc4,$xc4
704     + vxxlor $xv12 ,$xd4,$xd4
705     + vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25
706     + vxxlor $xv23 ,$xa1,$xa1
707     + vxxlor $xv24 ,$xa2,$xa2
708     + vxxlor $xv25 ,$xa3,$xa3
709     +
710     + lvx_4w @K[1],0,$key # load key
711     + lvx_4w @K[2],$x10,$key
712     + lvx_4w @K[3],0,$ctr # load counter
713     + vspltisw $xt3,4
714     +
715     +
716     + vxor $xt2,$xt2,$xt2
717     + lvx_4w $xt1,r11,r12
718     + vspltw $xa2,@K[3],0 #save the original count after spltw
719     + vsldoi @K[3],@K[3],$xt2,4
720     + vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0]
721     + vadduwm $xt1,$xa2,$xt1
722     + vadduwm $xt3,$xt1,$xt3 # next counter value
723     + vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8.
724     +
725     + be?lvsl $beperm,0,$x10 # 0x00..0f
726     + be?vspltisb $xt0,3 # 0x03..03
727     + be?vxor $beperm,$beperm,$xt0 # swap bytes within words
728     + be?vxxlor $xv26 ,$beperm,$beperm
729     +
730     + vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2
731     + vxxlor $xv1 ,@K[1],@K[1]
732     + vxxlor $xv2 ,@K[2],@K[2]
733     + vxxlor $xv3 ,@K[3],@K[3]
734     + vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5
735     + vxxlor $xv5 ,$xt3,$xt3
736     + vxxlor $xv8 ,$xa0,$xa0
737     +
738     + li r0,10 # inner loop counter
739     + mtctr r0
740     + b Loop_outer_vsx_8x
741     +
742     +.align 5
743     +Loop_outer_vsx_8x:
744     + vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma
745     + vxxlorc $xa1,$xv23,$xv23
746     + vxxlorc $xa2,$xv24,$xv24
747     + vxxlorc $xa3,$xv25,$xv25
748     + vxxlorc $xa4,$xv22,$xv22
749     + vxxlorc $xa5,$xv23,$xv23
750     + vxxlorc $xa6,$xv24,$xv24
751     + vxxlorc $xa7,$xv25,$xv25
752     +
753     + vspltw $xb0,@K[1],0 # smash the key
754     + vspltw $xb1,@K[1],1
755     + vspltw $xb2,@K[1],2
756     + vspltw $xb3,@K[1],3
757     + vspltw $xb4,@K[1],0 # smash the key
758     + vspltw $xb5,@K[1],1
759     + vspltw $xb6,@K[1],2
760     + vspltw $xb7,@K[1],3
761     +
762     + vspltw $xc0,@K[2],0
763     + vspltw $xc1,@K[2],1
764     + vspltw $xc2,@K[2],2
765     + vspltw $xc3,@K[2],3
766     + vspltw $xc4,@K[2],0
767     + vspltw $xc7,@K[2],3
768     + vspltw $xc5,@K[2],1
769     +
770     + vxxlorc $xd0,$xv4,$xv4 # smash the counter
771     + vspltw $xd1,@K[3],1
772     + vspltw $xd2,@K[3],2
773     + vspltw $xd3,@K[3],3
774     + vxxlorc $xd4,$xv5,$xv5 # smash the counter
775     + vspltw $xd5,@K[3],1
776     + vspltw $xd6,@K[3],2
777     + vspltw $xd7,@K[3],3
778     + vxxlorc $xc6,$xv8,$xv8 # copy of vspltw k[2],2 is in v8; v26 -> k[3], so need to wait until k[3] is done
779     +
780     +Loop_vsx_8x:
781     +___
782     + foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
783     + foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
784     +$code.=<<___;
785     +
786     + bdnz Loop_vsx_8x
787     + vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31
788     + vxxlor $xv14 ,$xd5,$xd5 #
789     + vxxlor $xv15 ,$xd6,$xd6 #
790     + vxxlor $xv16 ,$xd7,$xd7 #
791     +
792     + vxxlor $xv18 ,$xc4,$xc4 #
793     + vxxlor $xv19 ,$xc5,$xc5 #
794     + vxxlor $xv20 ,$xc6,$xc6 #
795     + vxxlor $xv21 ,$xc7,$xc7 #
796     +
797     + vxxlor $xv6 ,$xb6,$xb6 # save vr22, so we get 8 regs
798     + vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs
799     + be?vxxlorc $beperm,$xv26,$xv26 # copy back the beperm.
800     +
801     + vxxlorc @K[0],$xv0,$xv0 #27
802     + vxxlorc @K[1],$xv1,$xv1 #24
803     + vxxlorc @K[2],$xv2,$xv2 #25
804     + vxxlorc @K[3],$xv3,$xv3 #26
805     + vxxlorc $CTR0,$xv4,$xv4
806     +###changing to vertical
807     +
808     + vmrgew $xt0,$xa0,$xa1 # transpose data
809     + vmrgew $xt1,$xa2,$xa3
810     + vmrgow $xa0,$xa0,$xa1
811     + vmrgow $xa2,$xa2,$xa3
812     +
813     + vmrgew $xt2,$xb0,$xb1
814     + vmrgew $xt3,$xb2,$xb3
815     + vmrgow $xb0,$xb0,$xb1
816     + vmrgow $xb2,$xb2,$xb3
817     +
818     + vadduwm $xd0,$xd0,$CTR0
819     +
820     + vpermdi $xa1,$xa0,$xa2,0b00
821     + vpermdi $xa3,$xa0,$xa2,0b11
822     + vpermdi $xa0,$xt0,$xt1,0b00
823     + vpermdi $xa2,$xt0,$xt1,0b11
824     + vpermdi $xb1,$xb0,$xb2,0b00
825     + vpermdi $xb3,$xb0,$xb2,0b11
826     + vpermdi $xb0,$xt2,$xt3,0b00
827     + vpermdi $xb2,$xt2,$xt3,0b11
828     +
829     + vmrgew $xt0,$xc0,$xc1
830     + vmrgew $xt1,$xc2,$xc3
831     + vmrgow $xc0,$xc0,$xc1
832     + vmrgow $xc2,$xc2,$xc3
833     + vmrgew $xt2,$xd0,$xd1
834     + vmrgew $xt3,$xd2,$xd3
835     + vmrgow $xd0,$xd0,$xd1
836     + vmrgow $xd2,$xd2,$xd3
837     +
838     + vpermdi $xc1,$xc0,$xc2,0b00
839     + vpermdi $xc3,$xc0,$xc2,0b11
840     + vpermdi $xc0,$xt0,$xt1,0b00
841     + vpermdi $xc2,$xt0,$xt1,0b11
842     + vpermdi $xd1,$xd0,$xd2,0b00
843     + vpermdi $xd3,$xd0,$xd2,0b11
844     + vpermdi $xd0,$xt2,$xt3,0b00
845     + vpermdi $xd2,$xt2,$xt3,0b11
846     +
847     + vspltisw $xt0,8
848     + vadduwm $CTR0,$CTR0,$xt0 # next counter value
849     + vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5
850     +
851     + vadduwm $xa0,$xa0,@K[0]
852     + vadduwm $xb0,$xb0,@K[1]
853     + vadduwm $xc0,$xc0,@K[2]
854     + vadduwm $xd0,$xd0,@K[3]
855     +
856     + be?vperm $xa0,$xa0,$xa0,$beperm
857     + be?vperm $xb0,$xb0,$xb0,$beperm
858     + be?vperm $xc0,$xc0,$xc0,$beperm
859     + be?vperm $xd0,$xd0,$xd0,$beperm
860     +
861     + ${UCMP}i $len,0x40
862     + blt Ltail_vsx_8x
863     +
864     + lvx_4w $xt0,$x00,$inp
865     + lvx_4w $xt1,$x10,$inp
866     + lvx_4w $xt2,$x20,$inp
867     + lvx_4w $xt3,$x30,$inp
868     +
869     + vxor $xt0,$xt0,$xa0
870     + vxor $xt1,$xt1,$xb0
871     + vxor $xt2,$xt2,$xc0
872     + vxor $xt3,$xt3,$xd0
873     +
874     + stvx_4w $xt0,$x00,$out
875     + stvx_4w $xt1,$x10,$out
876     + addi $inp,$inp,0x40
877     + stvx_4w $xt2,$x20,$out
878     + subi $len,$len,0x40
879     + stvx_4w $xt3,$x30,$out
880     + addi $out,$out,0x40
881     + beq Ldone_vsx_8x
882     +
883     + vadduwm $xa0,$xa1,@K[0]
884     + vadduwm $xb0,$xb1,@K[1]
885     + vadduwm $xc0,$xc1,@K[2]
886     + vadduwm $xd0,$xd1,@K[3]
887     +
888     + be?vperm $xa0,$xa0,$xa0,$beperm
889     + be?vperm $xb0,$xb0,$xb0,$beperm
890     + be?vperm $xc0,$xc0,$xc0,$beperm
891     + be?vperm $xd0,$xd0,$xd0,$beperm
892     +
893     + ${UCMP}i $len,0x40
894     + blt Ltail_vsx_8x
895     +
896     + lvx_4w $xt0,$x00,$inp
897     + lvx_4w $xt1,$x10,$inp
898     + lvx_4w $xt2,$x20,$inp
899     + lvx_4w $xt3,$x30,$inp
900     +
901     + vxor $xt0,$xt0,$xa0
902     + vxor $xt1,$xt1,$xb0
903     + vxor $xt2,$xt2,$xc0
904     + vxor $xt3,$xt3,$xd0
905     +
906     + stvx_4w $xt0,$x00,$out
907     + stvx_4w $xt1,$x10,$out
908     + addi $inp,$inp,0x40
909     + stvx_4w $xt2,$x20,$out
910     + subi $len,$len,0x40
911     + stvx_4w $xt3,$x30,$out
912     + addi $out,$out,0x40
913     + beq Ldone_vsx_8x
914     +
915     + vadduwm $xa0,$xa2,@K[0]
916     + vadduwm $xb0,$xb2,@K[1]
917     + vadduwm $xc0,$xc2,@K[2]
918     + vadduwm $xd0,$xd2,@K[3]
919     +
920     + be?vperm $xa0,$xa0,$xa0,$beperm
921     + be?vperm $xb0,$xb0,$xb0,$beperm
922     + be?vperm $xc0,$xc0,$xc0,$beperm
923     + be?vperm $xd0,$xd0,$xd0,$beperm
924     +
925     + ${UCMP}i $len,0x40
926     + blt Ltail_vsx_8x
927     +
928     + lvx_4w $xt0,$x00,$inp
929     + lvx_4w $xt1,$x10,$inp
930     + lvx_4w $xt2,$x20,$inp
931     + lvx_4w $xt3,$x30,$inp
932     +
933     + vxor $xt0,$xt0,$xa0
934     + vxor $xt1,$xt1,$xb0
935     + vxor $xt2,$xt2,$xc0
936     + vxor $xt3,$xt3,$xd0
937     +
938     + stvx_4w $xt0,$x00,$out
939     + stvx_4w $xt1,$x10,$out
940     + addi $inp,$inp,0x40
941     + stvx_4w $xt2,$x20,$out
942     + subi $len,$len,0x40
943     + stvx_4w $xt3,$x30,$out
944     + addi $out,$out,0x40
945     + beq Ldone_vsx_8x
946     +
947     + vadduwm $xa0,$xa3,@K[0]
948     + vadduwm $xb0,$xb3,@K[1]
949     + vadduwm $xc0,$xc3,@K[2]
950     + vadduwm $xd0,$xd3,@K[3]
951     +
952     + be?vperm $xa0,$xa0,$xa0,$beperm
953     + be?vperm $xb0,$xb0,$xb0,$beperm
954     + be?vperm $xc0,$xc0,$xc0,$beperm
955     + be?vperm $xd0,$xd0,$xd0,$beperm
956     +
957     + ${UCMP}i $len,0x40
958     + blt Ltail_vsx_8x
959     +
960     + lvx_4w $xt0,$x00,$inp
961     + lvx_4w $xt1,$x10,$inp
962     + lvx_4w $xt2,$x20,$inp
963     + lvx_4w $xt3,$x30,$inp
964     +
965     + vxor $xt0,$xt0,$xa0
966     + vxor $xt1,$xt1,$xb0
967     + vxor $xt2,$xt2,$xc0
968     + vxor $xt3,$xt3,$xd0
969     +
970     + stvx_4w $xt0,$x00,$out
971     + stvx_4w $xt1,$x10,$out
972     + addi $inp,$inp,0x40
973     + stvx_4w $xt2,$x20,$out
974     + subi $len,$len,0x40
975     + stvx_4w $xt3,$x30,$out
976     + addi $out,$out,0x40
977     + beq Ldone_vsx_8x
978     +
979     +# blk4-7: vr24-31 remain the same, so the logic above can be reused; regs a4-b7 stay put, while c4-d7 are loaded into positions 8-15, reusing vr24-31.
980     +# VR0-3 are used to hold temporary values, and vr4 serves as xr0 in place of xt0.
981     +
982     + vxxlorc $CTR1 ,$xv5,$xv5
983     +
984     + vxxlorc $xcn4 ,$xv18,$xv18
985     + vxxlorc $xcn5 ,$xv19,$xv19
986     + vxxlorc $xcn6 ,$xv20,$xv20
987     + vxxlorc $xcn7 ,$xv21,$xv21
988     +
989     + vxxlorc $xdn4 ,$xv13,$xv13
990     + vxxlorc $xdn5 ,$xv14,$xv14
991     + vxxlorc $xdn6 ,$xv15,$xv15
992     + vxxlorc $xdn7 ,$xv16,$xv16
993     + vadduwm $xdn4,$xdn4,$CTR1
994     +
995     + vxxlorc $xb6 ,$xv6,$xv6
996     + vxxlorc $xb7 ,$xv7,$xv7
997     +# use $xr0 in place of $xt0 in blocks 4-7
998     +
999     + vmrgew $xr0,$xa4,$xa5 # transpose data
1000     + vmrgew $xt1,$xa6,$xa7
1001     + vmrgow $xa4,$xa4,$xa5
1002     + vmrgow $xa6,$xa6,$xa7
1003     + vmrgew $xt2,$xb4,$xb5
1004     + vmrgew $xt3,$xb6,$xb7
1005     + vmrgow $xb4,$xb4,$xb5
1006     + vmrgow $xb6,$xb6,$xb7
1007     +
1008     + vpermdi $xa5,$xa4,$xa6,0b00
1009     + vpermdi $xa7,$xa4,$xa6,0b11
1010     + vpermdi $xa4,$xr0,$xt1,0b00
1011     + vpermdi $xa6,$xr0,$xt1,0b11
1012     + vpermdi $xb5,$xb4,$xb6,0b00
1013     + vpermdi $xb7,$xb4,$xb6,0b11
1014     + vpermdi $xb4,$xt2,$xt3,0b00
1015     + vpermdi $xb6,$xt2,$xt3,0b11
1016     +
1017     + vmrgew $xr0,$xcn4,$xcn5
1018     + vmrgew $xt1,$xcn6,$xcn7
1019     + vmrgow $xcn4,$xcn4,$xcn5
1020     + vmrgow $xcn6,$xcn6,$xcn7
1021     + vmrgew $xt2,$xdn4,$xdn5
1022     + vmrgew $xt3,$xdn6,$xdn7
1023     + vmrgow $xdn4,$xdn4,$xdn5
1024     + vmrgow $xdn6,$xdn6,$xdn7
1025     +
1026     + vpermdi $xcn5,$xcn4,$xcn6,0b00
1027     + vpermdi $xcn7,$xcn4,$xcn6,0b11
1028     + vpermdi $xcn4,$xr0,$xt1,0b00
1029     + vpermdi $xcn6,$xr0,$xt1,0b11
1030     + vpermdi $xdn5,$xdn4,$xdn6,0b00
1031     + vpermdi $xdn7,$xdn4,$xdn6,0b11
1032     + vpermdi $xdn4,$xt2,$xt3,0b00
1033     + vpermdi $xdn6,$xt2,$xt3,0b11
1034     +
1035     + vspltisw $xr0,8
1036     + vadduwm $CTR1,$CTR1,$xr0 # next counter value
1037     + vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5
1038     +
1039     + vadduwm $xan0,$xa4,@K[0]
1040     + vadduwm $xbn0,$xb4,@K[1]
1041     + vadduwm $xcn0,$xcn4,@K[2]
1042     + vadduwm $xdn0,$xdn4,@K[3]
1043     +
1044     + be?vperm $xan0,$xa4,$xa4,$beperm
1045     + be?vperm $xbn0,$xb4,$xb4,$beperm
1046     + be?vperm $xcn0,$xcn4,$xcn4,$beperm
1047     + be?vperm $xdn0,$xdn4,$xdn4,$beperm
1048     +
1049     + ${UCMP}i $len,0x40
1050     + blt Ltail_vsx_8x_1
1051     +
1052     + lvx_4w $xr0,$x00,$inp
1053     + lvx_4w $xt1,$x10,$inp
1054     + lvx_4w $xt2,$x20,$inp
1055     + lvx_4w $xt3,$x30,$inp
1056     +
1057     + vxor $xr0,$xr0,$xan0
1058     + vxor $xt1,$xt1,$xbn0
1059     + vxor $xt2,$xt2,$xcn0
1060     + vxor $xt3,$xt3,$xdn0
1061     +
1062     + stvx_4w $xr0,$x00,$out
1063     + stvx_4w $xt1,$x10,$out
1064     + addi $inp,$inp,0x40
1065     + stvx_4w $xt2,$x20,$out
1066     + subi $len,$len,0x40
1067     + stvx_4w $xt3,$x30,$out
1068     + addi $out,$out,0x40
1069     + beq Ldone_vsx_8x
1070     +
1071     + vadduwm $xan0,$xa5,@K[0]
1072     + vadduwm $xbn0,$xb5,@K[1]
1073     + vadduwm $xcn0,$xcn5,@K[2]
1074     + vadduwm $xdn0,$xdn5,@K[3]
1075     +
1076     + be?vperm $xan0,$xan0,$xan0,$beperm
1077     + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1078     + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1079     + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1080     +
1081     + ${UCMP}i $len,0x40
1082     + blt Ltail_vsx_8x_1
1083     +
1084     + lvx_4w $xr0,$x00,$inp
1085     + lvx_4w $xt1,$x10,$inp
1086     + lvx_4w $xt2,$x20,$inp
1087     + lvx_4w $xt3,$x30,$inp
1088     +
1089     + vxor $xr0,$xr0,$xan0
1090     + vxor $xt1,$xt1,$xbn0
1091     + vxor $xt2,$xt2,$xcn0
1092     + vxor $xt3,$xt3,$xdn0
1093     +
1094     + stvx_4w $xr0,$x00,$out
1095     + stvx_4w $xt1,$x10,$out
1096     + addi $inp,$inp,0x40
1097     + stvx_4w $xt2,$x20,$out
1098     + subi $len,$len,0x40
1099     + stvx_4w $xt3,$x30,$out
1100     + addi $out,$out,0x40
1101     + beq Ldone_vsx_8x
1102     +
1103     + vadduwm $xan0,$xa6,@K[0]
1104     + vadduwm $xbn0,$xb6,@K[1]
1105     + vadduwm $xcn0,$xcn6,@K[2]
1106     + vadduwm $xdn0,$xdn6,@K[3]
1107     +
1108     + be?vperm $xan0,$xan0,$xan0,$beperm
1109     + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1110     + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1111     + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1112     +
1113     + ${UCMP}i $len,0x40
1114     + blt Ltail_vsx_8x_1
1115     +
1116     + lvx_4w $xr0,$x00,$inp
1117     + lvx_4w $xt1,$x10,$inp
1118     + lvx_4w $xt2,$x20,$inp
1119     + lvx_4w $xt3,$x30,$inp
1120     +
1121     + vxor $xr0,$xr0,$xan0
1122     + vxor $xt1,$xt1,$xbn0
1123     + vxor $xt2,$xt2,$xcn0
1124     + vxor $xt3,$xt3,$xdn0
1125     +
1126     + stvx_4w $xr0,$x00,$out
1127     + stvx_4w $xt1,$x10,$out
1128     + addi $inp,$inp,0x40
1129     + stvx_4w $xt2,$x20,$out
1130     + subi $len,$len,0x40
1131     + stvx_4w $xt3,$x30,$out
1132     + addi $out,$out,0x40
1133     + beq Ldone_vsx_8x
1134     +
1135     + vadduwm $xan0,$xa7,@K[0]
1136     + vadduwm $xbn0,$xb7,@K[1]
1137     + vadduwm $xcn0,$xcn7,@K[2]
1138     + vadduwm $xdn0,$xdn7,@K[3]
1139     +
1140     + be?vperm $xan0,$xan0,$xan0,$beperm
1141     + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1142     + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1143     + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1144     +
1145     + ${UCMP}i $len,0x40
1146     + blt Ltail_vsx_8x_1
1147     +
1148     + lvx_4w $xr0,$x00,$inp
1149     + lvx_4w $xt1,$x10,$inp
1150     + lvx_4w $xt2,$x20,$inp
1151     + lvx_4w $xt3,$x30,$inp
1152     +
1153     + vxor $xr0,$xr0,$xan0
1154     + vxor $xt1,$xt1,$xbn0
1155     + vxor $xt2,$xt2,$xcn0
1156     + vxor $xt3,$xt3,$xdn0
1157     +
1158     + stvx_4w $xr0,$x00,$out
1159     + stvx_4w $xt1,$x10,$out
1160     + addi $inp,$inp,0x40
1161     + stvx_4w $xt2,$x20,$out
1162     + subi $len,$len,0x40
1163     + stvx_4w $xt3,$x30,$out
1164     + addi $out,$out,0x40
1165     + beq Ldone_vsx_8x
1166     +
1167     + mtctr r0
1168     + bne Loop_outer_vsx_8x
1169     +
1170     +Ldone_vsx_8x:
1171     + lwz r12,`$FRAME-4`($sp) # pull vrsave
1172     + li r10,`15+$LOCALS+64`
1173     + li r11,`31+$LOCALS+64`
1174     + $POP r0, `$FRAME+$LRSAVE`($sp)
1175     + mtspr 256,r12 # restore vrsave
1176     + lvx v24,r10,$sp
1177     + addi r10,r10,32
1178     + lvx v25,r11,$sp
1179     + addi r11,r11,32
1180     + lvx v26,r10,$sp
1181     + addi r10,r10,32
1182     + lvx v27,r11,$sp
1183     + addi r11,r11,32
1184     + lvx v28,r10,$sp
1185     + addi r10,r10,32
1186     + lvx v29,r11,$sp
1187     + addi r11,r11,32
1188     + lvx v30,r10,$sp
1189     + lvx v31,r11,$sp
1190     + mtlr r0
1191     + addi $sp,$sp,$FRAME
1192     + blr
1193     +
1194     +.align 4
1195     +Ltail_vsx_8x:
1196     + addi r11,$sp,$LOCALS
1197     + mtctr $len
1198     + stvx_4w $xa0,$x00,r11 # offload block to stack
1199     + stvx_4w $xb0,$x10,r11
1200     + stvx_4w $xc0,$x20,r11
1201     + stvx_4w $xd0,$x30,r11
1202     + subi r12,r11,1 # prepare for *++ptr
1203     + subi $inp,$inp,1
1204     + subi $out,$out,1
1205     + bl Loop_tail_vsx_8x
1206     +Ltail_vsx_8x_1:
1207     + addi r11,$sp,$LOCALS
1208     + mtctr $len
1209     + stvx_4w $xan0,$x00,r11 # offload block to stack
1210     + stvx_4w $xbn0,$x10,r11
1211     + stvx_4w $xcn0,$x20,r11
1212     + stvx_4w $xdn0,$x30,r11
1213     + subi r12,r11,1 # prepare for *++ptr
1214     + subi $inp,$inp,1
1215     + subi $out,$out,1
1216     + bl Loop_tail_vsx_8x
1217     +
1218     +Loop_tail_vsx_8x:
1219     + lbzu r6,1(r12)
1220     + lbzu r7,1($inp)
1221     + xor r6,r6,r7
1222     + stbu r6,1($out)
1223     + bdnz Loop_tail_vsx_8x
1224     +
1225     + stvx_4w $K[0],$x00,r11 # wipe copy of the block
1226     + stvx_4w $K[0],$x10,r11
1227     + stvx_4w $K[0],$x20,r11
1228     + stvx_4w $K[0],$x30,r11
1229     +
1230     + b Ldone_vsx_8x
1231     + .long 0
1232     + .byte 0,12,0x04,1,0x80,0,5,0
1233     + .long 0
1234     +.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
1235     +___
1236     +}}}
1237     +
1238     +
1239     +$code.=<<___;
1240     +.align 5
1241     +Lconsts:
1242     + mflr r0
1243     + bcl 20,31,\$+4
1244     + mflr r12 #vvvvv "distance between . and Lsigma
1245     + addi r12,r12,`64-8`
1246     + mtlr r0
1247     + blr
1248     + .long 0
1249     + .byte 0,12,0x14,0,0,0,0,0
1250     + .space `64-9*4`
1251     +Lsigma:
1252     + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574
1253     + .long 1,0,0,0
1254     + .long 2,0,0,0
1255     + .long 3,0,0,0
1256     + .long 4,0,0,0
1257     +___
1258     +$code.=<<___ if ($LITTLE_ENDIAN);
1259     + .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
1260     + .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
1261     +___
1262     +$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words
1263     + .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
1264     + .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
1265     +___
1266     +$code.=<<___;
1267     + .long 0x61707865,0x61707865,0x61707865,0x61707865
1268     + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
1269     + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
1270     + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
1271     + .long 0,1,2,3
1272     + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
1273     +.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
1274     +.align 2
1275     +___
1276     +
1277     +foreach (split("\n",$code)) {
1278     + s/\`([^\`]*)\`/eval $1/ge;
1279     +
1280     + # instructions prefixed with '?' are endian-specific and need
1281     + # to be adjusted accordingly...
1282     + if ($flavour !~ /le$/) { # big-endian
1283     + s/be\?// or
1284     + s/le\?/#le#/ or
1285     + s/\?lvsr/lvsl/ or
1286     + s/\?lvsl/lvsr/ or
1287     + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
1288     + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
1289     + } else { # little-endian
1290     + s/le\?// or
1291     + s/be\?/#be#/ or
1292     + s/\?([a-z]+)/$1/ or
1293     + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
1294     + }
1295     +
1296     + print $_,"\n";
1297     +}
1298     +
1299     +close STDOUT or die "error closing STDOUT: $!";
1300     diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info
1301     index c12cb9c..2a819b2 100644
1302     --- a/crypto/chacha/build.info
1303     +++ b/crypto/chacha/build.info
1304     @@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}]
1305     $CHACHAASM_armv4=chacha-armv4.S
1306     $CHACHAASM_aarch64=chacha-armv8.S
1307    
1308     - $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s
1309     + $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
1310     $CHACHAASM_ppc64=$CHACHAASM_ppc32
1311    
1312     $CHACHAASM_c64xplus=chacha-c64xplus.s
1313     @@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM
1314     GENERATE[chacha-x86.S]=asm/chacha-x86.pl
1315     GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl
1316     GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl
1317     +GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
1318     GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
1319     INCLUDE[chacha-armv4.o]=..
1320     GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
1321     diff --git a/crypto/chacha/chacha_ppc.c b/crypto/chacha/chacha_ppc.c
1322     index 5319040..f99cca8 100644
1323     --- a/crypto/chacha/chacha_ppc.c
1324     +++ b/crypto/chacha/chacha_ppc.c
1325     @@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
1326     void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
1327     size_t len, const unsigned int key[8],
1328     const unsigned int counter[4]);
1329     +void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
1330     + size_t len, const unsigned int key[8],
1331     + const unsigned int counter[4]);
1332     void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
1333     size_t len, const unsigned int key[8],
1334     const unsigned int counter[4])
1335     {
1336     - OPENSSL_ppccap_P & PPC_CRYPTO207
1337     - ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
1338     - : OPENSSL_ppccap_P & PPC_ALTIVEC
1339     - ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
1340     - : ChaCha20_ctr32_int(out, inp, len, key, counter);
1341     + OPENSSL_ppccap_P & PPC_BRD31
1342     + ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter)
1343     + :OPENSSL_ppccap_P & PPC_CRYPTO207
1344     + ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
1345     + : OPENSSL_ppccap_P & PPC_ALTIVEC
1346     + ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
1347     + : ChaCha20_ctr32_int(out, inp, len, key, counter);
1348     }
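
The nested conditional expression added to ChaCha20_ctr32() above selects the
widest implementation available at run time: PPC_BRD31 is only set on ISA 3.1
(POWER10) hardware, so POWER8/POWER9 systems keep using ChaCha20_ctr32_vsx().
Rewritten as an if/else chain purely for readability (a sketch, not the patch
text; the prototypes and include are assumed to mirror what chacha_ppc.c
already declares):

    #include <stddef.h>
    #include "crypto/ppc_arch.h"   /* OPENSSL_ppccap_P, PPC_BRD31, ... */

    void ChaCha20_ctr32_int(unsigned char *out, const unsigned char *inp,
                            size_t len, const unsigned int key[8],
                            const unsigned int counter[4]);
    void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
                            size_t len, const unsigned int key[8],
                            const unsigned int counter[4]);
    void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
                            size_t len, const unsigned int key[8],
                            const unsigned int counter[4]);
    void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
                                size_t len, const unsigned int key[8],
                                const unsigned int counter[4]);

    void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                        size_t len, const unsigned int key[8],
                        const unsigned int counter[4])
    {
        if (OPENSSL_ppccap_P & PPC_BRD31)          /* ISA 3.1 / POWER10 */
            ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter);
        else if (OPENSSL_ppccap_P & PPC_CRYPTO207) /* POWER8/POWER9 VSX */
            ChaCha20_ctr32_vsx(out, inp, len, key, counter);
        else if (OPENSSL_ppccap_P & PPC_ALTIVEC)   /* AltiVec only */
            ChaCha20_ctr32_vmx(out, inp, len, key, counter);
        else
            ChaCha20_ctr32_int(out, inp, len, key, counter);
    }
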
1349     diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
1350     index 2ee4440..4590340 100755
1351     --- a/crypto/perlasm/ppc-xlate.pl
1352     +++ b/crypto/perlasm/ppc-xlate.pl
1353     @@ -293,6 +293,14 @@ my $vpermdi = sub { # xxpermdi
1354     $dm = oct($dm) if ($dm =~ /^0/);
1355     " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
1356     };
1357     +my $vxxlor = sub { # xxlor
1358     + my ($f, $vrt, $vra, $vrb) = @_;
1359     + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6;
1360     +};
1361     +my $vxxlorc = sub { # xxlor
1362     + my ($f, $vrt, $vra, $vrb) = @_;
1363     + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1;
1364     +};
1365    
1366     # PowerISA 2.07 stuff
1367     sub vcrypto_op {
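
The vxxlor/vxxlorc helpers added above hand-encode xxlor, an XX3-form VSX
instruction (primary opcode 60, extended opcode 146) whose low three bits are
the AX/BX/TX extension bits selecting the upper half of the VSX register file
(VSR 32-63, i.e. the vector registers). vxxlor uses 6 (AX=BX=1: VR sources,
low-VSR target) and vxxlorc uses 1 (TX=1: low-VSR sources, VR target), which
is how the 8x ChaCha code parks values in VSR 0-31 and fetches them back. A
small C sketch of the word being emitted (illustrative only; the function name
is invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Same formula as the perlasm helpers: opcode 60, XO 146, plus the
     * AX/BX/TX extension bits in the low three bits of the word. */
    static uint32_t xxlor_word(unsigned t, unsigned a, unsigned b, unsigned ext)
    {
        return (60u << 26) | (t << 21) | (a << 16) | (b << 11)
             | (146u << 3) | ext;
    }

    int main(void)
    {
        /* vxxlor v15,v30,v30: copy VR30 into VSR15 (ext = 6) */
        printf("0x%08X\n", xxlor_word(15, 30, 30, 6));
        /* vxxlorc v30,v15,v15: copy VSR15 back into VR30 (ext = 1) */
        printf("0x%08X\n", xxlor_word(30, 15, 15, 1));
        return 0;
    }
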
1368     @@ -377,6 +385,15 @@ my $addex = sub {
1369     };
1370     my $vmsumudm = sub { vfour_vsr(@_, 35); };
1371    
1372     +# PowerISA 3.1 stuff
1373     +my $brd = sub {
1374     + my ($f, $ra, $rs) = @_;
1375     + " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1);
1376     +};
1377     +my $vsrq = sub { vcrypto_op(@_, 517); };
1378     +
1379     +
1380     +
1381     while($line=<>) {
1382    
1383     $line =~ s|[#!;].*$||; # get rid of asm-style comments...
1384     diff --git a/crypto/ppccap.c b/crypto/ppccap.c
1385     index 8bcfed2..664627c 100644
1386     --- a/crypto/ppccap.c
1387     +++ b/crypto/ppccap.c
1388     @@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void);
1389     void OPENSSL_altivec_probe(void);
1390     void OPENSSL_crypto207_probe(void);
1391     void OPENSSL_madd300_probe(void);
1392     +void OPENSSL_brd31_probe(void);
1393    
1394     long OPENSSL_rdtsc_mftb(void);
1395     long OPENSSL_rdtsc_mfspr268(void);
1396     @@ -117,16 +118,21 @@ static unsigned long getauxval(unsigned long key)
1397     #endif
1398    
1399     /* I wish <sys/auxv.h> was universally available */
1400     -#define HWCAP 16 /* AT_HWCAP */
1401     +#ifndef AT_HWCAP
1402     +# define AT_HWCAP 16 /* AT_HWCAP */
1403     +#endif
1404     #define HWCAP_PPC64 (1U << 30)
1405     #define HWCAP_ALTIVEC (1U << 28)
1406     #define HWCAP_FPU (1U << 27)
1407     #define HWCAP_POWER6_EXT (1U << 9)
1408     #define HWCAP_VSX (1U << 7)
1409    
1410     -#define HWCAP2 26 /* AT_HWCAP2 */
1411     +#ifndef AT_HWCAP2
1412     +# define AT_HWCAP2 26 /* AT_HWCAP2 */
1413     +#endif
1414     #define HWCAP_VEC_CRYPTO (1U << 25)
1415     #define HWCAP_ARCH_3_00 (1U << 23)
1416     +#define HWCAP_ARCH_3_1 (1U << 18)
1417    
1418     # if defined(__GNUC__) && __GNUC__>=2
1419     __attribute__ ((constructor))
1420     @@ -187,6 +193,9 @@ void OPENSSL_cpuid_setup(void)
1421     if (__power_set(0xffffffffU<<17)) /* POWER9 and later */
1422     OPENSSL_ppccap_P |= PPC_MADD300;
1423    
1424     + if (__power_set(0xffffffffU<<18)) /* POWER10 and later */
1425     + OPENSSL_ppccap_P |= PPC_BRD31;
1426     +
1427     return;
1428     # endif
1429     #endif
1430     @@ -215,8 +224,8 @@ void OPENSSL_cpuid_setup(void)
1431    
1432     #ifdef OSSL_IMPLEMENT_GETAUXVAL
1433     {
1434     - unsigned long hwcap = getauxval(HWCAP);
1435     - unsigned long hwcap2 = getauxval(HWCAP2);
1436     + unsigned long hwcap = getauxval(AT_HWCAP);
1437     + unsigned long hwcap2 = getauxval(AT_HWCAP2);
1438    
1439     if (hwcap & HWCAP_FPU) {
1440     OPENSSL_ppccap_P |= PPC_FPU;
1441     @@ -242,6 +251,10 @@ void OPENSSL_cpuid_setup(void)
1442     if (hwcap2 & HWCAP_ARCH_3_00) {
1443     OPENSSL_ppccap_P |= PPC_MADD300;
1444     }
1445     +
1446     + if (hwcap2 & HWCAP_ARCH_3_1) {
1447     + OPENSSL_ppccap_P |= PPC_BRD31;
1448     + }
1449     }
1450     #endif
1451    
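
The getauxval() changes above gate the new code on HWCAP_ARCH_3_1, bit 18 of
AT_HWCAP2, which the Linux kernel sets on ISA 3.1 (POWER10 and later) CPUs;
that is the bit behind PPC_BRD31 and hence behind ChaCha20_ctr32_vsx_p10().
A standalone sketch of the same check (illustrative only, not part of the
patch; assumes Linux with glibc):

    #include <stdio.h>
    #include <sys/auxv.h>

    #define HWCAP_ARCH_3_1 (1U << 18)   /* PPC_FEATURE2_ARCH_3_1 */

    int main(void)
    {
        unsigned long hwcap2 = getauxval(AT_HWCAP2);

        if (hwcap2 & HWCAP_ARCH_3_1)
            printf("ISA 3.1 (POWER10) CPU: P10 ChaCha path eligible\n");
        else
            printf("pre-POWER10 CPU: VSX/AltiVec/integer paths only\n");
        return 0;
    }
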
1452     @@ -263,7 +276,7 @@ void OPENSSL_cpuid_setup(void)
1453     sigaction(SIGILL, &ill_act, &ill_oact);
1454    
1455     #ifndef OSSL_IMPLEMENT_GETAUXVAL
1456     - if (sigsetjmp(ill_jmp,1) == 0) {
1457     + if (sigsetjmp(ill_jmp, 1) == 0) {
1458     OPENSSL_fpu_probe();
1459     OPENSSL_ppccap_P |= PPC_FPU;
1460    
1461     diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
1462     index c6555df..706164a 100755
1463     --- a/crypto/ppccpuid.pl
1464     +++ b/crypto/ppccpuid.pl
1465     @@ -81,6 +81,17 @@ $code=<<___;
1466     .long 0
1467     .byte 0,12,0x14,0,0,0,0,0
1468    
1469     +.globl .OPENSSL_brd31_probe
1470     +.align 4
1471     +.OPENSSL_brd31_probe:
1472     + xor r0,r0,r0
1473     + brd r3,r0
1474     + blr
1475     + .long 0
1476     + .byte 0,12,0x14,0,0,0,0,0
1477     +.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
1478     +
1479     +
1480     .globl .OPENSSL_wipe_cpu
1481     .align 4
1482     .OPENSSL_wipe_cpu:
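
OPENSSL_brd31_probe above simply executes brd (byte-reverse doubleword), an
instruction introduced in Power ISA 3.1; on the SIGILL-probing fallback path a
trap means the CPU predates POWER10. Functionally brd produces the same result
as a 64-bit byte swap, as this small sketch shows (illustrative only; the
helper name is invented and __builtin_bswap64 assumes GCC or Clang):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t brd_equiv(uint64_t x)
    {
        return __builtin_bswap64(x);    /* what "brd rT,rS" computes */
    }

    int main(void)
    {
        printf("0x%016llx\n",
               (unsigned long long)brd_equiv(0x0102030405060708ULL));
        /* expected: 0x0807060504030201 */
        return 0;
    }
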
1483     diff --git a/include/crypto/ppc_arch.h b/include/crypto/ppc_arch.h
1484     index 3b3ce4b..fcc846c 100644
1485     --- a/include/crypto/ppc_arch.h
1486     +++ b/include/crypto/ppc_arch.h
1487     @@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P;
1488     # define PPC_MADD300 (1<<4)
1489     # define PPC_MFTB (1<<5)
1490     # define PPC_MFSPR268 (1<<6)
1491     +# define PPC_BRD31 (1<<7)
1492    
1493     #endif
