/[smecontribs]/rpms/openssl3/contribs10/0072-ChaCha20-performance-optimizations-for-ppc64le.patch

Contents of /rpms/openssl3/contribs10/0072-ChaCha20-performance-optimizations-for-ppc64le.patch



Revision 1.1
Wed Jan 31 17:24:44 2024 UTC by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 Upstream-Status: Backport [
2 https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149,
3 https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa,
4 hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447
5 ]
6 diff --git a/crypto/chacha/asm/chachap10-ppc.pl b/crypto/chacha/asm/chachap10-ppc.pl
7 new file mode 100755
8 index 0000000..36e9a8d
9 --- /dev/null
10 +++ b/crypto/chacha/asm/chachap10-ppc.pl
11 @@ -0,0 +1,1288 @@
12 +#! /usr/bin/env perl
13 +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
14 +#
15 +# Licensed under the Apache License 2.0 (the "License"). You may not use
16 +# this file except in compliance with the License. You can obtain a copy
17 +# in the file LICENSE in the source distribution or at
18 +# https://www.openssl.org/source/license.html
19 +
20 +#
21 +# ====================================================================
22 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
23 +# project. The module is, however, dual licensed under OpenSSL and
24 +# CRYPTOGAMS licenses depending on where you obtain it. For further
25 +# details see http://www.openssl.org/~appro/cryptogams/.
26 +# ====================================================================
27 +#
28 +# October 2015
29 +#
30 +# ChaCha20 for PowerPC/AltiVec.
31 +#
32 +# June 2018
33 +#
34 +# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
35 +# processors that can't issue more than one vector instruction per
36 +# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
37 +# interleave would perform better. Incidentally PowerISA 2.07 (first
38 +# implemented by POWER8) defined new usable instructions, hence 4xVSX
39 +# code path...
40 +#
41 +# Performance in cycles per byte out of large buffer.
42 +#
43 +# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX
44 +#
45 +# Freescale e300 13.6/+115% - -
46 +# PPC74x0/G4e 6.81/+310% 3.81 -
47 +# PPC970/G5 9.29/+160% ? -
48 +# POWER7 8.62/+61% 3.35 -
49 +# POWER8 8.70/+51% 2.91 2.09
50 +# POWER9 8.80/+29% 4.44(*) 2.45(**)
51 +#
52 +# (*) this is trade-off result, it's possible to improve it, but
53 +# then it would negatively affect all others;
54 +# (**) POWER9 seems to be "allergic" to mixing vector and integer
55 +# instructions, which is why switch to vector-only code pays
56 +# off that much;
57 +
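
For reference, the 4xVSX path described above interleaves four copies of the scalar ChaCha20 quarter-round, one 32-bit lane per block. A minimal C sketch of that quarter-round and of the ten double rounds driven by the CTR register (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Scalar ChaCha20 quarter-round: the 4xVSX code below runs four of these
     * side by side with vadduwm/vxor/vrlw, one 32-bit lane per block. */
    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    static void chacha_quarter_round(uint32_t x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
    }

    /* Ten double rounds (20 rounds total), matching "li r0,10" / "bdnz": a
     * column pass followed by a diagonal pass, which is exactly what the two
     * VSX_lane_ROUND_4x invocations in Loop_vsx_4x emit. */
    static void chacha_double_rounds(uint32_t x[16])
    {
        for (int i = 0; i < 10; i++) {
            chacha_quarter_round(x, 0, 4,  8, 12);   /* columns   */
            chacha_quarter_round(x, 1, 5,  9, 13);
            chacha_quarter_round(x, 2, 6, 10, 14);
            chacha_quarter_round(x, 3, 7, 11, 15);
            chacha_quarter_round(x, 0, 5, 10, 15);   /* diagonals */
            chacha_quarter_round(x, 1, 6, 11, 12);
            chacha_quarter_round(x, 2, 7,  8, 13);
            chacha_quarter_round(x, 3, 4,  9, 14);
        }
    }
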
58 +# $output is the last argument if it looks like a file (it has an extension)
59 +# $flavour is the first argument if it doesn't look like a file
60 +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
61 +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
62 +
63 +if ($flavour =~ /64/) {
64 + $SIZE_T =8;
65 + $LRSAVE =2*$SIZE_T;
66 + $STU ="stdu";
67 + $POP ="ld";
68 + $PUSH ="std";
69 + $UCMP ="cmpld";
70 +} elsif ($flavour =~ /32/) {
71 + $SIZE_T =4;
72 + $LRSAVE =$SIZE_T;
73 + $STU ="stwu";
74 + $POP ="lwz";
75 + $PUSH ="stw";
76 + $UCMP ="cmplw";
77 +} else { die "nonsense $flavour"; }
78 +
79 +$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
80 +
81 +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
82 +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83 +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
84 +die "can't locate ppc-xlate.pl";
85 +
86 +open STDOUT,"| $^X $xlate $flavour \"$output\""
87 + or die "can't call $xlate: $!";
88 +
89 +$LOCALS=6*$SIZE_T;
90 +$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables
91 +
92 +sub AUTOLOAD() # thunk [simplified] x86-style perlasm
93 +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
94 + $code .= "\t$opcode\t".join(',',@_)."\n";
95 +}
96 +
97 +my $sp = "r1";
98 +
99 +my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
100 +
101 +
102 +{{{
103 +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
104 + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
105 +my @K = map("v$_",(16..19));
106 +my $CTR = "v26";
107 +my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
108 +my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
109 +my $beperm = "v31";
110 +
111 +my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
112 +
113 +my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload
114 +
115 +
116 +sub VSX_lane_ROUND_4x {
117 +my ($a0,$b0,$c0,$d0)=@_;
118 +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
119 +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
120 +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
121 +my @x=map("\"v$_\"",(0..15));
122 +
123 + (
124 + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
125 + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
126 + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
127 + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
128 + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
129 + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
130 + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
131 + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
132 + "&vrlw (@x[$d0],@x[$d0],'$sixteen')",
133 + "&vrlw (@x[$d1],@x[$d1],'$sixteen')",
134 + "&vrlw (@x[$d2],@x[$d2],'$sixteen')",
135 + "&vrlw (@x[$d3],@x[$d3],'$sixteen')",
136 +
137 + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
138 + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
139 + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
140 + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
141 + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
142 + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
143 + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
144 + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
145 + "&vrlw (@x[$b0],@x[$b0],'$twelve')",
146 + "&vrlw (@x[$b1],@x[$b1],'$twelve')",
147 + "&vrlw (@x[$b2],@x[$b2],'$twelve')",
148 + "&vrlw (@x[$b3],@x[$b3],'$twelve')",
149 +
150 + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
151 + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
152 + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
153 + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
154 + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
155 + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
156 + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
157 + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
158 + "&vrlw (@x[$d0],@x[$d0],'$eight')",
159 + "&vrlw (@x[$d1],@x[$d1],'$eight')",
160 + "&vrlw (@x[$d2],@x[$d2],'$eight')",
161 + "&vrlw (@x[$d3],@x[$d3],'$eight')",
162 +
163 + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
164 + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
165 + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
166 + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
167 + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
168 + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
169 + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
170 + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
171 + "&vrlw (@x[$b0],@x[$b0],'$seven')",
172 + "&vrlw (@x[$b1],@x[$b1],'$seven')",
173 + "&vrlw (@x[$b2],@x[$b2],'$seven')",
174 + "&vrlw (@x[$b3],@x[$b3],'$seven')"
175 + );
176 +}
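
The three map(...) lines in VSX_lane_ROUND_4x derive the remaining index quadruples from the first one by keeping the group of four and rotating the position within it. The same arithmetic in C (an illustrative helper, not part of the patch):

    /* Same arithmetic as map(($_ & ~3) + (($_ + 1) & 3), ...) in the Perl:
     * starting from the column quadruple (0,4,8,12) it yields (1,5,9,13),
     * (2,6,10,14), (3,7,11,15); starting from (0,5,10,15) it yields the
     * remaining diagonal quadruples (1,6,11,12), (2,7,8,13), (3,4,9,14). */
    static int next_lane(int i)
    {
        return (i & ~3) + ((i + 1) & 3);
    }

This is why a single sub can emit both the column pass VSX_lane_ROUND_4x(0,4,8,12) and the diagonal pass VSX_lane_ROUND_4x(0,5,10,15) used in Loop_vsx_4x below.
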
177 +
178 +$code.=<<___;
179 +
180 +.globl .ChaCha20_ctr32_vsx_p10
181 +.align 5
182 +.ChaCha20_ctr32_vsx_p10:
183 + ${UCMP}i $len,255
184 + bgt ChaCha20_ctr32_vsx_8x
185 + $STU $sp,-$FRAME($sp)
186 + mflr r0
187 + li r10,`15+$LOCALS+64`
188 + li r11,`31+$LOCALS+64`
189 + mfspr r12,256
190 + stvx v26,r10,$sp
191 + addi r10,r10,32
192 + stvx v27,r11,$sp
193 + addi r11,r11,32
194 + stvx v28,r10,$sp
195 + addi r10,r10,32
196 + stvx v29,r11,$sp
197 + addi r11,r11,32
198 + stvx v30,r10,$sp
199 + stvx v31,r11,$sp
200 + stw r12,`$FRAME-4`($sp) # save vrsave
201 + li r12,-4096+63
202 + $PUSH r0, `$FRAME+$LRSAVE`($sp)
203 + mtspr 256,r12 # preserve 29 AltiVec registers
204 +
205 + bl Lconsts # returns pointer Lsigma in r12
206 + lvx_4w @K[0],0,r12 # load sigma
207 + addi r12,r12,0x70
208 + li $x10,16
209 + li $x20,32
210 + li $x30,48
211 + li r11,64
212 +
213 + lvx_4w @K[1],0,$key # load key
214 + lvx_4w @K[2],$x10,$key
215 + lvx_4w @K[3],0,$ctr # load counter
216 +
217 + vxor $xt0,$xt0,$xt0
218 + lvx_4w $xt1,r11,r12
219 + vspltw $CTR,@K[3],0
220 + vsldoi @K[3],@K[3],$xt0,4
221 + vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0]
222 + vadduwm $CTR,$CTR,$xt1
223 +
224 + be?lvsl $beperm,0,$x10 # 0x00..0f
225 + be?vspltisb $xt0,3 # 0x03..03
226 + be?vxor $beperm,$beperm,$xt0 # swap bytes within words
227 +
228 + li r0,10 # inner loop counter
229 + mtctr r0
230 + b Loop_outer_vsx
231 +
232 +.align 5
233 +Loop_outer_vsx:
234 + lvx $xa0,$x00,r12 # load [smashed] sigma
235 + lvx $xa1,$x10,r12
236 + lvx $xa2,$x20,r12
237 + lvx $xa3,$x30,r12
238 +
239 + vspltw $xb0,@K[1],0 # smash the key
240 + vspltw $xb1,@K[1],1
241 + vspltw $xb2,@K[1],2
242 + vspltw $xb3,@K[1],3
243 +
244 + vspltw $xc0,@K[2],0
245 + vspltw $xc1,@K[2],1
246 + vspltw $xc2,@K[2],2
247 + vspltw $xc3,@K[2],3
248 +
249 + vmr $xd0,$CTR # smash the counter
250 + vspltw $xd1,@K[3],1
251 + vspltw $xd2,@K[3],2
252 + vspltw $xd3,@K[3],3
253 +
254 + vspltisw $sixteen,-16 # synthesize constants
255 + vspltisw $twelve,12
256 + vspltisw $eight,8
257 + vspltisw $seven,7
258 +
259 +Loop_vsx_4x:
260 +___
261 + foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }
262 + foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }
263 +$code.=<<___;
264 +
265 + bdnz Loop_vsx_4x
266 +
267 + vadduwm $xd0,$xd0,$CTR
268 +
269 + vmrgew $xt0,$xa0,$xa1 # transpose data
270 + vmrgew $xt1,$xa2,$xa3
271 + vmrgow $xa0,$xa0,$xa1
272 + vmrgow $xa2,$xa2,$xa3
273 + vmrgew $xt2,$xb0,$xb1
274 + vmrgew $xt3,$xb2,$xb3
275 + vpermdi $xa1,$xa0,$xa2,0b00
276 + vpermdi $xa3,$xa0,$xa2,0b11
277 + vpermdi $xa0,$xt0,$xt1,0b00
278 + vpermdi $xa2,$xt0,$xt1,0b11
279 +
280 + vmrgow $xb0,$xb0,$xb1
281 + vmrgow $xb2,$xb2,$xb3
282 + vmrgew $xt0,$xc0,$xc1
283 + vmrgew $xt1,$xc2,$xc3
284 + vpermdi $xb1,$xb0,$xb2,0b00
285 + vpermdi $xb3,$xb0,$xb2,0b11
286 + vpermdi $xb0,$xt2,$xt3,0b00
287 + vpermdi $xb2,$xt2,$xt3,0b11
288 +
289 + vmrgow $xc0,$xc0,$xc1
290 + vmrgow $xc2,$xc2,$xc3
291 + vmrgew $xt2,$xd0,$xd1
292 + vmrgew $xt3,$xd2,$xd3
293 + vpermdi $xc1,$xc0,$xc2,0b00
294 + vpermdi $xc3,$xc0,$xc2,0b11
295 + vpermdi $xc0,$xt0,$xt1,0b00
296 + vpermdi $xc2,$xt0,$xt1,0b11
297 +
298 + vmrgow $xd0,$xd0,$xd1
299 + vmrgow $xd2,$xd2,$xd3
300 + vspltisw $xt0,4
301 + vadduwm $CTR,$CTR,$xt0 # next counter value
302 + vpermdi $xd1,$xd0,$xd2,0b00
303 + vpermdi $xd3,$xd0,$xd2,0b11
304 + vpermdi $xd0,$xt2,$xt3,0b00
305 + vpermdi $xd2,$xt2,$xt3,0b11
306 +
307 + vadduwm $xa0,$xa0,@K[0]
308 + vadduwm $xb0,$xb0,@K[1]
309 + vadduwm $xc0,$xc0,@K[2]
310 + vadduwm $xd0,$xd0,@K[3]
311 +
312 + be?vperm $xa0,$xa0,$xa0,$beperm
313 + be?vperm $xb0,$xb0,$xb0,$beperm
314 + be?vperm $xc0,$xc0,$xc0,$beperm
315 + be?vperm $xd0,$xd0,$xd0,$beperm
316 +
317 + ${UCMP}i $len,0x40
318 + blt Ltail_vsx
319 +
320 + lvx_4w $xt0,$x00,$inp
321 + lvx_4w $xt1,$x10,$inp
322 + lvx_4w $xt2,$x20,$inp
323 + lvx_4w $xt3,$x30,$inp
324 +
325 + vxor $xt0,$xt0,$xa0
326 + vxor $xt1,$xt1,$xb0
327 + vxor $xt2,$xt2,$xc0
328 + vxor $xt3,$xt3,$xd0
329 +
330 + stvx_4w $xt0,$x00,$out
331 + stvx_4w $xt1,$x10,$out
332 + addi $inp,$inp,0x40
333 + stvx_4w $xt2,$x20,$out
334 + subi $len,$len,0x40
335 + stvx_4w $xt3,$x30,$out
336 + addi $out,$out,0x40
337 + beq Ldone_vsx
338 +
339 + vadduwm $xa0,$xa1,@K[0]
340 + vadduwm $xb0,$xb1,@K[1]
341 + vadduwm $xc0,$xc1,@K[2]
342 + vadduwm $xd0,$xd1,@K[3]
343 +
344 + be?vperm $xa0,$xa0,$xa0,$beperm
345 + be?vperm $xb0,$xb0,$xb0,$beperm
346 + be?vperm $xc0,$xc0,$xc0,$beperm
347 + be?vperm $xd0,$xd0,$xd0,$beperm
348 +
349 + ${UCMP}i $len,0x40
350 + blt Ltail_vsx
351 +
352 + lvx_4w $xt0,$x00,$inp
353 + lvx_4w $xt1,$x10,$inp
354 + lvx_4w $xt2,$x20,$inp
355 + lvx_4w $xt3,$x30,$inp
356 +
357 + vxor $xt0,$xt0,$xa0
358 + vxor $xt1,$xt1,$xb0
359 + vxor $xt2,$xt2,$xc0
360 + vxor $xt3,$xt3,$xd0
361 +
362 + stvx_4w $xt0,$x00,$out
363 + stvx_4w $xt1,$x10,$out
364 + addi $inp,$inp,0x40
365 + stvx_4w $xt2,$x20,$out
366 + subi $len,$len,0x40
367 + stvx_4w $xt3,$x30,$out
368 + addi $out,$out,0x40
369 + beq Ldone_vsx
370 +
371 + vadduwm $xa0,$xa2,@K[0]
372 + vadduwm $xb0,$xb2,@K[1]
373 + vadduwm $xc0,$xc2,@K[2]
374 + vadduwm $xd0,$xd2,@K[3]
375 +
376 + be?vperm $xa0,$xa0,$xa0,$beperm
377 + be?vperm $xb0,$xb0,$xb0,$beperm
378 + be?vperm $xc0,$xc0,$xc0,$beperm
379 + be?vperm $xd0,$xd0,$xd0,$beperm
380 +
381 + ${UCMP}i $len,0x40
382 + blt Ltail_vsx
383 +
384 + lvx_4w $xt0,$x00,$inp
385 + lvx_4w $xt1,$x10,$inp
386 + lvx_4w $xt2,$x20,$inp
387 + lvx_4w $xt3,$x30,$inp
388 +
389 + vxor $xt0,$xt0,$xa0
390 + vxor $xt1,$xt1,$xb0
391 + vxor $xt2,$xt2,$xc0
392 + vxor $xt3,$xt3,$xd0
393 +
394 + stvx_4w $xt0,$x00,$out
395 + stvx_4w $xt1,$x10,$out
396 + addi $inp,$inp,0x40
397 + stvx_4w $xt2,$x20,$out
398 + subi $len,$len,0x40
399 + stvx_4w $xt3,$x30,$out
400 + addi $out,$out,0x40
401 + beq Ldone_vsx
402 +
403 + vadduwm $xa0,$xa3,@K[0]
404 + vadduwm $xb0,$xb3,@K[1]
405 + vadduwm $xc0,$xc3,@K[2]
406 + vadduwm $xd0,$xd3,@K[3]
407 +
408 + be?vperm $xa0,$xa0,$xa0,$beperm
409 + be?vperm $xb0,$xb0,$xb0,$beperm
410 + be?vperm $xc0,$xc0,$xc0,$beperm
411 + be?vperm $xd0,$xd0,$xd0,$beperm
412 +
413 + ${UCMP}i $len,0x40
414 + blt Ltail_vsx
415 +
416 + lvx_4w $xt0,$x00,$inp
417 + lvx_4w $xt1,$x10,$inp
418 + lvx_4w $xt2,$x20,$inp
419 + lvx_4w $xt3,$x30,$inp
420 +
421 + vxor $xt0,$xt0,$xa0
422 + vxor $xt1,$xt1,$xb0
423 + vxor $xt2,$xt2,$xc0
424 + vxor $xt3,$xt3,$xd0
425 +
426 + stvx_4w $xt0,$x00,$out
427 + stvx_4w $xt1,$x10,$out
428 + addi $inp,$inp,0x40
429 + stvx_4w $xt2,$x20,$out
430 + subi $len,$len,0x40
431 + stvx_4w $xt3,$x30,$out
432 + addi $out,$out,0x40
433 + mtctr r0
434 + bne Loop_outer_vsx
435 +
436 +Ldone_vsx:
437 + lwz r12,`$FRAME-4`($sp) # pull vrsave
438 + li r10,`15+$LOCALS+64`
439 + li r11,`31+$LOCALS+64`
440 + $POP r0, `$FRAME+$LRSAVE`($sp)
441 + mtspr 256,r12 # restore vrsave
442 + lvx v26,r10,$sp
443 + addi r10,r10,32
444 + lvx v27,r11,$sp
445 + addi r11,r11,32
446 + lvx v28,r10,$sp
447 + addi r10,r10,32
448 + lvx v29,r11,$sp
449 + addi r11,r11,32
450 + lvx v30,r10,$sp
451 + lvx v31,r11,$sp
452 + mtlr r0
453 + addi $sp,$sp,$FRAME
454 + blr
455 +
456 +.align 4
457 +Ltail_vsx:
458 + addi r11,$sp,$LOCALS
459 + mtctr $len
460 + stvx_4w $xa0,$x00,r11 # offload block to stack
461 + stvx_4w $xb0,$x10,r11
462 + stvx_4w $xc0,$x20,r11
463 + stvx_4w $xd0,$x30,r11
464 + subi r12,r11,1 # prepare for *++ptr
465 + subi $inp,$inp,1
466 + subi $out,$out,1
467 +
468 +Loop_tail_vsx:
469 + lbzu r6,1(r12)
470 + lbzu r7,1($inp)
471 + xor r6,r6,r7
472 + stbu r6,1($out)
473 + bdnz Loop_tail_vsx
474 +
475 + stvx_4w $K[0],$x00,r11 # wipe copy of the block
476 + stvx_4w $K[0],$x10,r11
477 + stvx_4w $K[0],$x20,r11
478 + stvx_4w $K[0],$x30,r11
479 +
480 + b Ldone_vsx
481 + .long 0
482 + .byte 0,12,0x04,1,0x80,0,5,0
483 + .long 0
484 +.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
485 +___
486 +}}}
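
The Ltail_vsx path above handles a trailing partial block (len < 64) by spilling one keystream block to the stack, XORing byte by byte, then overwriting the stack copy. A rough C equivalent, assuming a keystream[64] buffer that stands in for the four offloaded vectors:

    #include <stddef.h>
    #include <string.h>

    /* Sketch of Ltail_vsx/Loop_tail_vsx: 'keystream' stands in for the four
     * vectors offloaded to the stack at $LOCALS($sp). */
    static void chacha_tail_xor(unsigned char *out, const unsigned char *in,
                                size_t len, unsigned char keystream[64])
    {
        for (size_t i = 0; i < len; i++)   /* the lbzu/xor/stbu loop */
            out[i] = in[i] ^ keystream[i];
        /* The assembly wipes the stack copy by storing @K[0] over it;
         * zeroing serves the same purpose of not leaving keystream behind. */
        memset(keystream, 0, 64);
    }
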
487 +
488 +## This is the 8-block parallel implementation. The heart of the ChaCha round uses vector instructions
489 +# that can only reach vsr[32+X], so all 32 vector registers are needed to hold the state of the 8 blocks.
490 +# A few register values are therefore staged off to the side, freeing VSR[32+X] for the vector
491 +# instructions used in the round operation while VSR[0]-VSR[31] hold the intermediate values.
492 +#
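
The register budget behind that comment: 8 blocks x 16 state words = 32 four-lane vectors, which is the entire VR file. A purely conceptual C layout of the smashed 8-block state (names are illustrative):

    #include <stdint.h>

    typedef struct { uint32_t lane[4]; } vec4x32;

    /* Conceptual only: each ChaCha state word is replicated across lanes,
     * four blocks per vector, so 8 blocks need 2 x 16 = 32 vectors -- all of
     * v0-v31 -- leaving nothing for the rotate constants, which is why they
     * are parked in VSR[0..31] and swapped in with xxlor. */
    typedef struct {
        vec4x32 word_lo[16];   /* state word i, blocks 0-3 (v0..v15)  */
        vec4x32 word_hi[16];   /* state word i, blocks 4-7 (v16..v31) */
    } chacha_8x_state;
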
493 +{{{
494 +#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
495 +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
496 + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
497 + $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
498 + $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));
499 +my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));
500 +my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));
501 +my @K = map("v$_",27,(24..26));
502 +my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));
503 +my $xr0 = "v4";
504 +my $CTR0 = "v22";
505 +my $CTR1 = "v5";
506 +my $beperm = "v31";
507 +my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
508 +my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));
509 +my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
510 +my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
511 +my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));
512 +
513 +my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload
514 +
515 +sub VSX_lane_ROUND_8x {
516 +my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;
517 +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
518 +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
519 +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
520 +my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
521 +my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
522 +my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
523 +my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));
524 +my @x=map("\"v$_\"",(0..31));
525 +
526 + (
527 + "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13
528 + "&vxxlorc (@x[$c7], $xv9,$xv9)",
529 +
530 + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
531 + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
532 + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
533 + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
534 + "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1
535 + "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2
536 + "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3
537 + "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4
538 +
539 + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
540 + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
541 + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
542 + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
543 + "&vxor (@x[$d4],@x[$d4],@x[$a4])",
544 + "&vxor (@x[$d5],@x[$d5],@x[$a5])",
545 + "&vxor (@x[$d6],@x[$d6],@x[$a6])",
546 + "&vxor (@x[$d7],@x[$d7],@x[$a7])",
547 +
548 + "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
549 + "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
550 + "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
551 + "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
552 + "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
553 + "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
554 + "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
555 + "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
556 +
557 + "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
558 + "&vxxlorc (@x[$c7], $xv15,$xv15)",
559 + "&vxxlorc (@x[$a7], $xv10,$xv10)",
560 +
561 + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
562 + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
563 + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
564 + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
565 + "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
566 + "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
567 + "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
568 + "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
569 +
570 + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
571 + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
572 + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
573 + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
574 + "&vxor (@x[$b4],@x[$b4],@x[$c4])",
575 + "&vxor (@x[$b5],@x[$b5],@x[$c5])",
576 + "&vxor (@x[$b6],@x[$b6],@x[$c6])",
577 + "&vxor (@x[$b7],@x[$b7],@x[$c7])",
578 +
579 + "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
580 + "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
581 + "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
582 + "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
583 + "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
584 + "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
585 + "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
586 + "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
587 +
588 + "&vxxlorc (@x[$a7], $xv13,$xv13)",
589 + "&vxxlor ($xv15 ,@x[$c7],@x[$c7])",
590 + "&vxxlorc (@x[$c7], $xv11,$xv11)",
591 +
592 +
593 + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
594 + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
595 + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
596 + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
597 + "&vadduwm (@x[$a4],@x[$a4],@x[$b4])",
598 + "&vadduwm (@x[$a5],@x[$a5],@x[$b5])",
599 + "&vadduwm (@x[$a6],@x[$a6],@x[$b6])",
600 + "&vadduwm (@x[$a7],@x[$a7],@x[$b7])",
601 +
602 + "&vxor (@x[$d0],@x[$d0],@x[$a0])",
603 + "&vxor (@x[$d1],@x[$d1],@x[$a1])",
604 + "&vxor (@x[$d2],@x[$d2],@x[$a2])",
605 + "&vxor (@x[$d3],@x[$d3],@x[$a3])",
606 + "&vxor (@x[$d4],@x[$d4],@x[$a4])",
607 + "&vxor (@x[$d5],@x[$d5],@x[$a5])",
608 + "&vxor (@x[$d6],@x[$d6],@x[$a6])",
609 + "&vxor (@x[$d7],@x[$d7],@x[$a7])",
610 +
611 + "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
612 + "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
613 + "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
614 + "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
615 + "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
616 + "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
617 + "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
618 + "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
619 +
620 + "&vxxlorc (@x[$c7], $xv15,$xv15)",
621 + "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
622 + "&vxxlorc (@x[$a7], $xv12,$xv12)",
623 +
624 + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
625 + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
626 + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
627 + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
628 + "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
629 + "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
630 + "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
631 + "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
632 + "&vxor (@x[$b0],@x[$b0],@x[$c0])",
633 + "&vxor (@x[$b1],@x[$b1],@x[$c1])",
634 + "&vxor (@x[$b2],@x[$b2],@x[$c2])",
635 + "&vxor (@x[$b3],@x[$b3],@x[$c3])",
636 + "&vxor (@x[$b4],@x[$b4],@x[$c4])",
637 + "&vxor (@x[$b5],@x[$b5],@x[$c5])",
638 + "&vxor (@x[$b6],@x[$b6],@x[$c6])",
639 + "&vxor (@x[$b7],@x[$b7],@x[$c7])",
640 + "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
641 + "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
642 + "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
643 + "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
644 + "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
645 + "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
646 + "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
647 + "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
648 +
649 + "&vxxlorc (@x[$a7], $xv13,$xv13)",
650 + );
651 +}
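
The vxxlor/vxxlorc pairs in the routine above implement a save-and-borrow pattern: one architected VR is parked in a low VSR, reloaded with a rotate constant for a batch of vrlw, and later restored. Conceptually (a sketch, not real intrinsics):

    #include <stdint.h>

    typedef struct { uint32_t lane[4]; } vec4x32;

    static vec4x32 low_vsr[32];               /* stands in for VSR[0..31] */

    /* vxxlor : park the current contents of a VR in a low VSR slot,
     * vxxlorc: pull a value (e.g. a rotate constant) back into the VR. */
    static void borrow_vr(vec4x32 *vr, int save_slot, int const_slot)
    {
        low_vsr[save_slot] = *vr;             /* vxxlor  ($xv.., @x[..]) */
        *vr = low_vsr[const_slot];            /* vxxlorc (@x[..], $xv..) */
    }

    static void restore_vr(vec4x32 *vr, int save_slot)
    {
        *vr = low_vsr[save_slot];             /* vxxlorc (@x[..], $xv..) */
    }
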
652 +
653 +$code.=<<___;
654 +
655 +.globl .ChaCha20_ctr32_vsx_8x
656 +.align 5
657 +.ChaCha20_ctr32_vsx_8x:
658 + $STU $sp,-$FRAME($sp)
659 + mflr r0
660 + li r10,`15+$LOCALS+64`
661 + li r11,`31+$LOCALS+64`
662 + mfspr r12,256
663 + stvx v24,r10,$sp
664 + addi r10,r10,32
665 + stvx v25,r11,$sp
666 + addi r11,r11,32
667 + stvx v26,r10,$sp
668 + addi r10,r10,32
669 + stvx v27,r11,$sp
670 + addi r11,r11,32
671 + stvx v28,r10,$sp
672 + addi r10,r10,32
673 + stvx v29,r11,$sp
674 + addi r11,r11,32
675 + stvx v30,r10,$sp
676 + stvx v31,r11,$sp
677 + stw r12,`$FRAME-4`($sp) # save vrsave
678 + li r12,-4096+63
679 + $PUSH r0, `$FRAME+$LRSAVE`($sp)
680 + mtspr 256,r12 # preserve 29 AltiVec registers
681 +
682 + bl Lconsts # returns pointer Lsigma in r12
683 +
684 + lvx_4w @K[0],0,r12 # load sigma
685 + addi r12,r12,0x70
686 + li $x10,16
687 + li $x20,32
688 + li $x30,48
689 + li r11,64
690 +
691 + vspltisw $xa4,-16 # synthesize constants
692 + vspltisw $xb4,12 # synthesize constants
693 + vspltisw $xc4,8 # synthesize constants
694 + vspltisw $xd4,7 # synthesize constants
695 +
696 + lvx $xa0,$x00,r12 # load [smashed] sigma
697 + lvx $xa1,$x10,r12
698 + lvx $xa2,$x20,r12
699 + lvx $xa3,$x30,r12
700 +
701 + vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12
702 + vxxlor $xv10 ,$xb4,$xb4
703 + vxxlor $xv11 ,$xc4,$xc4
704 + vxxlor $xv12 ,$xd4,$xd4
705 + vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25
706 + vxxlor $xv23 ,$xa1,$xa1
707 + vxxlor $xv24 ,$xa2,$xa2
708 + vxxlor $xv25 ,$xa3,$xa3
709 +
710 + lvx_4w @K[1],0,$key # load key
711 + lvx_4w @K[2],$x10,$key
712 + lvx_4w @K[3],0,$ctr # load counter
713 + vspltisw $xt3,4
714 +
715 +
716 + vxor $xt2,$xt2,$xt2
717 + lvx_4w $xt1,r11,r12
718 + vspltw $xa2,@K[3],0 #save the original counter after vspltw
719 + vsldoi @K[3],@K[3],$xt2,4
720 + vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0]
721 + vadduwm $xt1,$xa2,$xt1
722 + vadduwm $xt3,$xt1,$xt3 # next counter value
723 + vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8.
724 +
725 + be?lvsl $beperm,0,$x10 # 0x00..0f
726 + be?vspltisb $xt0,3 # 0x03..03
727 + be?vxor $beperm,$beperm,$xt0 # swap bytes within words
728 + be?vxxlor $xv26 ,$beperm,$beperm
729 +
730 + vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2
731 + vxxlor $xv1 ,@K[1],@K[1]
732 + vxxlor $xv2 ,@K[2],@K[2]
733 + vxxlor $xv3 ,@K[3],@K[3]
734 + vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5
735 + vxxlor $xv5 ,$xt3,$xt3
736 + vxxlor $xv8 ,$xa0,$xa0
737 +
738 + li r0,10 # inner loop counter
739 + mtctr r0
740 + b Loop_outer_vsx_8x
741 +
742 +.align 5
743 +Loop_outer_vsx_8x:
744 + vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma
745 + vxxlorc $xa1,$xv23,$xv23
746 + vxxlorc $xa2,$xv24,$xv24
747 + vxxlorc $xa3,$xv25,$xv25
748 + vxxlorc $xa4,$xv22,$xv22
749 + vxxlorc $xa5,$xv23,$xv23
750 + vxxlorc $xa6,$xv24,$xv24
751 + vxxlorc $xa7,$xv25,$xv25
752 +
753 + vspltw $xb0,@K[1],0 # smash the key
754 + vspltw $xb1,@K[1],1
755 + vspltw $xb2,@K[1],2
756 + vspltw $xb3,@K[1],3
757 + vspltw $xb4,@K[1],0 # smash the key
758 + vspltw $xb5,@K[1],1
759 + vspltw $xb6,@K[1],2
760 + vspltw $xb7,@K[1],3
761 +
762 + vspltw $xc0,@K[2],0
763 + vspltw $xc1,@K[2],1
764 + vspltw $xc2,@K[2],2
765 + vspltw $xc3,@K[2],3
766 + vspltw $xc4,@K[2],0
767 + vspltw $xc7,@K[2],3
768 + vspltw $xc5,@K[2],1
769 +
770 + vxxlorc $xd0,$xv4,$xv4 # smash the counter
771 + vspltw $xd1,@K[3],1
772 + vspltw $xd2,@K[3],2
773 + vspltw $xd3,@K[3],3
774 + vxxlorc $xd4,$xv5,$xv5 # smash the counter
775 + vspltw $xd5,@K[3],1
776 + vspltw $xd6,@K[3],2
777 + vspltw $xd7,@K[3],3
779 + vxxlorc $xc6,$xv8,$xv8 # saved vspltw of K[2],2 is in v8; v26 is K[3], so wait until K[3] has been smashed before overwriting it
779 +
780 +Loop_vsx_8x:
781 +___
782 + foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
783 + foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
784 +$code.=<<___;
785 +
786 + bdnz Loop_vsx_8x
787 + vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31
788 + vxxlor $xv14 ,$xd5,$xd5 #
789 + vxxlor $xv15 ,$xd6,$xd6 #
790 + vxxlor $xv16 ,$xd7,$xd7 #
791 +
792 + vxxlor $xv18 ,$xc4,$xc4 #
793 + vxxlor $xv19 ,$xc5,$xc5 #
794 + vxxlor $xv20 ,$xc6,$xc6 #
795 + vxxlor $xv21 ,$xc7,$xc7 #
796 +
797 + vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs
798 + vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs
799 + be?vxxlorc $beperm,$xv26,$xv26 # copy back the beperm.
800 +
801 + vxxlorc @K[0],$xv0,$xv0 #27
802 + vxxlorc @K[1],$xv1,$xv1 #24
803 + vxxlorc @K[2],$xv2,$xv2 #25
804 + vxxlorc @K[3],$xv3,$xv3 #26
805 + vxxlorc $CTR0,$xv4,$xv4
806 +###changing to vertical
807 +
808 + vmrgew $xt0,$xa0,$xa1 # transpose data
809 + vmrgew $xt1,$xa2,$xa3
810 + vmrgow $xa0,$xa0,$xa1
811 + vmrgow $xa2,$xa2,$xa3
812 +
813 + vmrgew $xt2,$xb0,$xb1
814 + vmrgew $xt3,$xb2,$xb3
815 + vmrgow $xb0,$xb0,$xb1
816 + vmrgow $xb2,$xb2,$xb3
817 +
818 + vadduwm $xd0,$xd0,$CTR0
819 +
820 + vpermdi $xa1,$xa0,$xa2,0b00
821 + vpermdi $xa3,$xa0,$xa2,0b11
822 + vpermdi $xa0,$xt0,$xt1,0b00
823 + vpermdi $xa2,$xt0,$xt1,0b11
824 + vpermdi $xb1,$xb0,$xb2,0b00
825 + vpermdi $xb3,$xb0,$xb2,0b11
826 + vpermdi $xb0,$xt2,$xt3,0b00
827 + vpermdi $xb2,$xt2,$xt3,0b11
828 +
829 + vmrgew $xt0,$xc0,$xc1
830 + vmrgew $xt1,$xc2,$xc3
831 + vmrgow $xc0,$xc0,$xc1
832 + vmrgow $xc2,$xc2,$xc3
833 + vmrgew $xt2,$xd0,$xd1
834 + vmrgew $xt3,$xd2,$xd3
835 + vmrgow $xd0,$xd0,$xd1
836 + vmrgow $xd2,$xd2,$xd3
837 +
838 + vpermdi $xc1,$xc0,$xc2,0b00
839 + vpermdi $xc3,$xc0,$xc2,0b11
840 + vpermdi $xc0,$xt0,$xt1,0b00
841 + vpermdi $xc2,$xt0,$xt1,0b11
842 + vpermdi $xd1,$xd0,$xd2,0b00
843 + vpermdi $xd3,$xd0,$xd2,0b11
844 + vpermdi $xd0,$xt2,$xt3,0b00
845 + vpermdi $xd2,$xt2,$xt3,0b11
846 +
847 + vspltisw $xt0,8
848 + vadduwm $CTR0,$CTR0,$xt0 # next counter value
849 + vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5
850 +
851 + vadduwm $xa0,$xa0,@K[0]
852 + vadduwm $xb0,$xb0,@K[1]
853 + vadduwm $xc0,$xc0,@K[2]
854 + vadduwm $xd0,$xd0,@K[3]
855 +
856 + be?vperm $xa0,$xa0,$xa0,$beperm
857 + be?vperm $xb0,$xb0,$xb0,$beperm
858 + be?vperm $xc0,$xc0,$xc0,$beperm
859 + be?vperm $xd0,$xd0,$xd0,$beperm
860 +
861 + ${UCMP}i $len,0x40
862 + blt Ltail_vsx_8x
863 +
864 + lvx_4w $xt0,$x00,$inp
865 + lvx_4w $xt1,$x10,$inp
866 + lvx_4w $xt2,$x20,$inp
867 + lvx_4w $xt3,$x30,$inp
868 +
869 + vxor $xt0,$xt0,$xa0
870 + vxor $xt1,$xt1,$xb0
871 + vxor $xt2,$xt2,$xc0
872 + vxor $xt3,$xt3,$xd0
873 +
874 + stvx_4w $xt0,$x00,$out
875 + stvx_4w $xt1,$x10,$out
876 + addi $inp,$inp,0x40
877 + stvx_4w $xt2,$x20,$out
878 + subi $len,$len,0x40
879 + stvx_4w $xt3,$x30,$out
880 + addi $out,$out,0x40
881 + beq Ldone_vsx_8x
882 +
883 + vadduwm $xa0,$xa1,@K[0]
884 + vadduwm $xb0,$xb1,@K[1]
885 + vadduwm $xc0,$xc1,@K[2]
886 + vadduwm $xd0,$xd1,@K[3]
887 +
888 + be?vperm $xa0,$xa0,$xa0,$beperm
889 + be?vperm $xb0,$xb0,$xb0,$beperm
890 + be?vperm $xc0,$xc0,$xc0,$beperm
891 + be?vperm $xd0,$xd0,$xd0,$beperm
892 +
893 + ${UCMP}i $len,0x40
894 + blt Ltail_vsx_8x
895 +
896 + lvx_4w $xt0,$x00,$inp
897 + lvx_4w $xt1,$x10,$inp
898 + lvx_4w $xt2,$x20,$inp
899 + lvx_4w $xt3,$x30,$inp
900 +
901 + vxor $xt0,$xt0,$xa0
902 + vxor $xt1,$xt1,$xb0
903 + vxor $xt2,$xt2,$xc0
904 + vxor $xt3,$xt3,$xd0
905 +
906 + stvx_4w $xt0,$x00,$out
907 + stvx_4w $xt1,$x10,$out
908 + addi $inp,$inp,0x40
909 + stvx_4w $xt2,$x20,$out
910 + subi $len,$len,0x40
911 + stvx_4w $xt3,$x30,$out
912 + addi $out,$out,0x40
913 + beq Ldone_vsx_8x
914 +
915 + vadduwm $xa0,$xa2,@K[0]
916 + vadduwm $xb0,$xb2,@K[1]
917 + vadduwm $xc0,$xc2,@K[2]
918 + vadduwm $xd0,$xd2,@K[3]
919 +
920 + be?vperm $xa0,$xa0,$xa0,$beperm
921 + be?vperm $xb0,$xb0,$xb0,$beperm
922 + be?vperm $xc0,$xc0,$xc0,$beperm
923 + be?vperm $xd0,$xd0,$xd0,$beperm
924 +
925 + ${UCMP}i $len,0x40
926 + blt Ltail_vsx_8x
927 +
928 + lvx_4w $xt0,$x00,$inp
929 + lvx_4w $xt1,$x10,$inp
930 + lvx_4w $xt2,$x20,$inp
931 + lvx_4w $xt3,$x30,$inp
932 +
933 + vxor $xt0,$xt0,$xa0
934 + vxor $xt1,$xt1,$xb0
935 + vxor $xt2,$xt2,$xc0
936 + vxor $xt3,$xt3,$xd0
937 +
938 + stvx_4w $xt0,$x00,$out
939 + stvx_4w $xt1,$x10,$out
940 + addi $inp,$inp,0x40
941 + stvx_4w $xt2,$x20,$out
942 + subi $len,$len,0x40
943 + stvx_4w $xt3,$x30,$out
944 + addi $out,$out,0x40
945 + beq Ldone_vsx_8x
946 +
947 + vadduwm $xa0,$xa3,@K[0]
948 + vadduwm $xb0,$xb3,@K[1]
949 + vadduwm $xc0,$xc3,@K[2]
950 + vadduwm $xd0,$xd3,@K[3]
951 +
952 + be?vperm $xa0,$xa0,$xa0,$beperm
953 + be?vperm $xb0,$xb0,$xb0,$beperm
954 + be?vperm $xc0,$xc0,$xc0,$beperm
955 + be?vperm $xd0,$xd0,$xd0,$beperm
956 +
957 + ${UCMP}i $len,0x40
958 + blt Ltail_vsx_8x
959 +
960 + lvx_4w $xt0,$x00,$inp
961 + lvx_4w $xt1,$x10,$inp
962 + lvx_4w $xt2,$x20,$inp
963 + lvx_4w $xt3,$x30,$inp
964 +
965 + vxor $xt0,$xt0,$xa0
966 + vxor $xt1,$xt1,$xb0
967 + vxor $xt2,$xt2,$xc0
968 + vxor $xt3,$xt3,$xd0
969 +
970 + stvx_4w $xt0,$x00,$out
971 + stvx_4w $xt1,$x10,$out
972 + addi $inp,$inp,0x40
973 + stvx_4w $xt2,$x20,$out
974 + subi $len,$len,0x40
975 + stvx_4w $xt3,$x30,$out
976 + addi $out,$out,0x40
977 + beq Ldone_vsx_8x
978 +
979 +# Blocks 4-7: the transpose logic above is reused. Registers a4-b7 stay in place; c4-d7 are reloaded into
980 +# positions 8-15 (v8-v15), so vr24-31 can be reused. VR0-3 hold temporaries, and vr4 serves as $xr0 instead of $xt0.
981 +
982 + vxxlorc $CTR1 ,$xv5,$xv5
983 +
984 + vxxlorc $xcn4 ,$xv18,$xv18
985 + vxxlorc $xcn5 ,$xv19,$xv19
986 + vxxlorc $xcn6 ,$xv20,$xv20
987 + vxxlorc $xcn7 ,$xv21,$xv21
988 +
989 + vxxlorc $xdn4 ,$xv13,$xv13
990 + vxxlorc $xdn5 ,$xv14,$xv14
991 + vxxlorc $xdn6 ,$xv15,$xv15
992 + vxxlorc $xdn7 ,$xv16,$xv16
993 + vadduwm $xdn4,$xdn4,$CTR1
994 +
995 + vxxlorc $xb6 ,$xv6,$xv6
996 + vxxlorc $xb7 ,$xv7,$xv7
997 +# use $xr0 in place of $xt0 for blocks 4-7
998 +
999 + vmrgew $xr0,$xa4,$xa5 # transpose data
1000 + vmrgew $xt1,$xa6,$xa7
1001 + vmrgow $xa4,$xa4,$xa5
1002 + vmrgow $xa6,$xa6,$xa7
1003 + vmrgew $xt2,$xb4,$xb5
1004 + vmrgew $xt3,$xb6,$xb7
1005 + vmrgow $xb4,$xb4,$xb5
1006 + vmrgow $xb6,$xb6,$xb7
1007 +
1008 + vpermdi $xa5,$xa4,$xa6,0b00
1009 + vpermdi $xa7,$xa4,$xa6,0b11
1010 + vpermdi $xa4,$xr0,$xt1,0b00
1011 + vpermdi $xa6,$xr0,$xt1,0b11
1012 + vpermdi $xb5,$xb4,$xb6,0b00
1013 + vpermdi $xb7,$xb4,$xb6,0b11
1014 + vpermdi $xb4,$xt2,$xt3,0b00
1015 + vpermdi $xb6,$xt2,$xt3,0b11
1016 +
1017 + vmrgew $xr0,$xcn4,$xcn5
1018 + vmrgew $xt1,$xcn6,$xcn7
1019 + vmrgow $xcn4,$xcn4,$xcn5
1020 + vmrgow $xcn6,$xcn6,$xcn7
1021 + vmrgew $xt2,$xdn4,$xdn5
1022 + vmrgew $xt3,$xdn6,$xdn7
1023 + vmrgow $xdn4,$xdn4,$xdn5
1024 + vmrgow $xdn6,$xdn6,$xdn7
1025 +
1026 + vpermdi $xcn5,$xcn4,$xcn6,0b00
1027 + vpermdi $xcn7,$xcn4,$xcn6,0b11
1028 + vpermdi $xcn4,$xr0,$xt1,0b00
1029 + vpermdi $xcn6,$xr0,$xt1,0b11
1030 + vpermdi $xdn5,$xdn4,$xdn6,0b00
1031 + vpermdi $xdn7,$xdn4,$xdn6,0b11
1032 + vpermdi $xdn4,$xt2,$xt3,0b00
1033 + vpermdi $xdn6,$xt2,$xt3,0b11
1034 +
1035 + vspltisw $xr0,8
1036 + vadduwm $CTR1,$CTR1,$xr0 # next counter value
1037 + vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5
1038 +
1039 + vadduwm $xan0,$xa4,@K[0]
1040 + vadduwm $xbn0,$xb4,@K[1]
1041 + vadduwm $xcn0,$xcn4,@K[2]
1042 + vadduwm $xdn0,$xdn4,@K[3]
1043 +
1044 + be?vperm $xan0,$xa4,$xa4,$beperm
1045 + be?vperm $xbn0,$xb4,$xb4,$beperm
1046 + be?vperm $xcn0,$xcn4,$xcn4,$beperm
1047 + be?vperm $xdn0,$xdn4,$xdn4,$beperm
1048 +
1049 + ${UCMP}i $len,0x40
1050 + blt Ltail_vsx_8x_1
1051 +
1052 + lvx_4w $xr0,$x00,$inp
1053 + lvx_4w $xt1,$x10,$inp
1054 + lvx_4w $xt2,$x20,$inp
1055 + lvx_4w $xt3,$x30,$inp
1056 +
1057 + vxor $xr0,$xr0,$xan0
1058 + vxor $xt1,$xt1,$xbn0
1059 + vxor $xt2,$xt2,$xcn0
1060 + vxor $xt3,$xt3,$xdn0
1061 +
1062 + stvx_4w $xr0,$x00,$out
1063 + stvx_4w $xt1,$x10,$out
1064 + addi $inp,$inp,0x40
1065 + stvx_4w $xt2,$x20,$out
1066 + subi $len,$len,0x40
1067 + stvx_4w $xt3,$x30,$out
1068 + addi $out,$out,0x40
1069 + beq Ldone_vsx_8x
1070 +
1071 + vadduwm $xan0,$xa5,@K[0]
1072 + vadduwm $xbn0,$xb5,@K[1]
1073 + vadduwm $xcn0,$xcn5,@K[2]
1074 + vadduwm $xdn0,$xdn5,@K[3]
1075 +
1076 + be?vperm $xan0,$xan0,$xan0,$beperm
1077 + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1078 + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1079 + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1080 +
1081 + ${UCMP}i $len,0x40
1082 + blt Ltail_vsx_8x_1
1083 +
1084 + lvx_4w $xr0,$x00,$inp
1085 + lvx_4w $xt1,$x10,$inp
1086 + lvx_4w $xt2,$x20,$inp
1087 + lvx_4w $xt3,$x30,$inp
1088 +
1089 + vxor $xr0,$xr0,$xan0
1090 + vxor $xt1,$xt1,$xbn0
1091 + vxor $xt2,$xt2,$xcn0
1092 + vxor $xt3,$xt3,$xdn0
1093 +
1094 + stvx_4w $xr0,$x00,$out
1095 + stvx_4w $xt1,$x10,$out
1096 + addi $inp,$inp,0x40
1097 + stvx_4w $xt2,$x20,$out
1098 + subi $len,$len,0x40
1099 + stvx_4w $xt3,$x30,$out
1100 + addi $out,$out,0x40
1101 + beq Ldone_vsx_8x
1102 +
1103 + vadduwm $xan0,$xa6,@K[0]
1104 + vadduwm $xbn0,$xb6,@K[1]
1105 + vadduwm $xcn0,$xcn6,@K[2]
1106 + vadduwm $xdn0,$xdn6,@K[3]
1107 +
1108 + be?vperm $xan0,$xan0,$xan0,$beperm
1109 + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1110 + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1111 + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1112 +
1113 + ${UCMP}i $len,0x40
1114 + blt Ltail_vsx_8x_1
1115 +
1116 + lvx_4w $xr0,$x00,$inp
1117 + lvx_4w $xt1,$x10,$inp
1118 + lvx_4w $xt2,$x20,$inp
1119 + lvx_4w $xt3,$x30,$inp
1120 +
1121 + vxor $xr0,$xr0,$xan0
1122 + vxor $xt1,$xt1,$xbn0
1123 + vxor $xt2,$xt2,$xcn0
1124 + vxor $xt3,$xt3,$xdn0
1125 +
1126 + stvx_4w $xr0,$x00,$out
1127 + stvx_4w $xt1,$x10,$out
1128 + addi $inp,$inp,0x40
1129 + stvx_4w $xt2,$x20,$out
1130 + subi $len,$len,0x40
1131 + stvx_4w $xt3,$x30,$out
1132 + addi $out,$out,0x40
1133 + beq Ldone_vsx_8x
1134 +
1135 + vadduwm $xan0,$xa7,@K[0]
1136 + vadduwm $xbn0,$xb7,@K[1]
1137 + vadduwm $xcn0,$xcn7,@K[2]
1138 + vadduwm $xdn0,$xdn7,@K[3]
1139 +
1140 + be?vperm $xan0,$xan0,$xan0,$beperm
1141 + be?vperm $xbn0,$xbn0,$xbn0,$beperm
1142 + be?vperm $xcn0,$xcn0,$xcn0,$beperm
1143 + be?vperm $xdn0,$xdn0,$xdn0,$beperm
1144 +
1145 + ${UCMP}i $len,0x40
1146 + blt Ltail_vsx_8x_1
1147 +
1148 + lvx_4w $xr0,$x00,$inp
1149 + lvx_4w $xt1,$x10,$inp
1150 + lvx_4w $xt2,$x20,$inp
1151 + lvx_4w $xt3,$x30,$inp
1152 +
1153 + vxor $xr0,$xr0,$xan0
1154 + vxor $xt1,$xt1,$xbn0
1155 + vxor $xt2,$xt2,$xcn0
1156 + vxor $xt3,$xt3,$xdn0
1157 +
1158 + stvx_4w $xr0,$x00,$out
1159 + stvx_4w $xt1,$x10,$out
1160 + addi $inp,$inp,0x40
1161 + stvx_4w $xt2,$x20,$out
1162 + subi $len,$len,0x40
1163 + stvx_4w $xt3,$x30,$out
1164 + addi $out,$out,0x40
1165 + beq Ldone_vsx_8x
1166 +
1167 + mtctr r0
1168 + bne Loop_outer_vsx_8x
1169 +
1170 +Ldone_vsx_8x:
1171 + lwz r12,`$FRAME-4`($sp) # pull vrsave
1172 + li r10,`15+$LOCALS+64`
1173 + li r11,`31+$LOCALS+64`
1174 + $POP r0, `$FRAME+$LRSAVE`($sp)
1175 + mtspr 256,r12 # restore vrsave
1176 + lvx v24,r10,$sp
1177 + addi r10,r10,32
1178 + lvx v25,r11,$sp
1179 + addi r11,r11,32
1180 + lvx v26,r10,$sp
1181 + addi r10,r10,32
1182 + lvx v27,r11,$sp
1183 + addi r11,r11,32
1184 + lvx v28,r10,$sp
1185 + addi r10,r10,32
1186 + lvx v29,r11,$sp
1187 + addi r11,r11,32
1188 + lvx v30,r10,$sp
1189 + lvx v31,r11,$sp
1190 + mtlr r0
1191 + addi $sp,$sp,$FRAME
1192 + blr
1193 +
1194 +.align 4
1195 +Ltail_vsx_8x:
1196 + addi r11,$sp,$LOCALS
1197 + mtctr $len
1198 + stvx_4w $xa0,$x00,r11 # offload block to stack
1199 + stvx_4w $xb0,$x10,r11
1200 + stvx_4w $xc0,$x20,r11
1201 + stvx_4w $xd0,$x30,r11
1202 + subi r12,r11,1 # prepare for *++ptr
1203 + subi $inp,$inp,1
1204 + subi $out,$out,1
1205 + bl Loop_tail_vsx_8x
1206 +Ltail_vsx_8x_1:
1207 + addi r11,$sp,$LOCALS
1208 + mtctr $len
1209 + stvx_4w $xan0,$x00,r11 # offload block to stack
1210 + stvx_4w $xbn0,$x10,r11
1211 + stvx_4w $xcn0,$x20,r11
1212 + stvx_4w $xdn0,$x30,r11
1213 + subi r12,r11,1 # prepare for *++ptr
1214 + subi $inp,$inp,1
1215 + subi $out,$out,1
1216 + bl Loop_tail_vsx_8x
1217 +
1218 +Loop_tail_vsx_8x:
1219 + lbzu r6,1(r12)
1220 + lbzu r7,1($inp)
1221 + xor r6,r6,r7
1222 + stbu r6,1($out)
1223 + bdnz Loop_tail_vsx_8x
1224 +
1225 + stvx_4w $K[0],$x00,r11 # wipe copy of the block
1226 + stvx_4w $K[0],$x10,r11
1227 + stvx_4w $K[0],$x20,r11
1228 + stvx_4w $K[0],$x30,r11
1229 +
1230 + b Ldone_vsx_8x
1231 + .long 0
1232 + .byte 0,12,0x04,1,0x80,0,5,0
1233 + .long 0
1234 +.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
1235 +___
1236 +}}}
1237 +
1238 +
1239 +$code.=<<___;
1240 +.align 5
1241 +Lconsts:
1242 + mflr r0
1243 + bcl 20,31,\$+4
1244 + mflr r12 #vvvvv "distance between . and Lsigma
1245 + addi r12,r12,`64-8`
1246 + mtlr r0
1247 + blr
1248 + .long 0
1249 + .byte 0,12,0x14,0,0,0,0,0
1250 + .space `64-9*4`
1251 +Lsigma:
1252 + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574
1253 + .long 1,0,0,0
1254 + .long 2,0,0,0
1255 + .long 3,0,0,0
1256 + .long 4,0,0,0
1257 +___
1258 +$code.=<<___ if ($LITTLE_ENDIAN);
1259 + .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
1260 + .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
1261 +___
1262 +$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words
1263 + .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
1264 + .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
1265 +___
1266 +$code.=<<___;
1267 + .long 0x61707865,0x61707865,0x61707865,0x61707865
1268 + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
1269 + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
1270 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
1271 + .long 0,1,2,3
1272 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
1273 +.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
1274 +.align 2
1275 +___
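
The first Lsigma row is the standard ChaCha constant, i.e. the ASCII string "expand 32-byte k" read as four little-endian 32-bit words. A small illustrative check in C:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* "expand 32-byte k" as little-endian words matches the Lsigma values. */
    static void check_sigma(void)
    {
        static const char sigma[16] = "expand 32-byte k";
        uint32_t w[4];

        memcpy(w, sigma, sizeof(w));          /* assumes a little-endian host */
        assert(w[0] == 0x61707865 && w[1] == 0x3320646e &&
               w[2] == 0x79622d32 && w[3] == 0x6b206574);
    }
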
1276 +
1277 +foreach (split("\n",$code)) {
1278 + s/\`([^\`]*)\`/eval $1/ge;
1279 +
1280 + # instructions prefixed with '?' are endian-specific and need
1281 + # to be adjusted accordingly...
1282 + if ($flavour !~ /le$/) { # big-endian
1283 + s/be\?// or
1284 + s/le\?/#le#/ or
1285 + s/\?lvsr/lvsl/ or
1286 + s/\?lvsl/lvsr/ or
1287 + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
1288 + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
1289 + } else { # little-endian
1290 + s/le\?// or
1291 + s/be\?/#be#/ or
1292 + s/\?([a-z]+)/$1/ or
1293 + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
1294 + }
1295 +
1296 + print $_,"\n";
1297 +}
1298 +
1299 +close STDOUT or die "error closing STDOUT: $!";
1300 diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info
1301 index c12cb9c..2a819b2 100644
1302 --- a/crypto/chacha/build.info
1303 +++ b/crypto/chacha/build.info
1304 @@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}]
1305 $CHACHAASM_armv4=chacha-armv4.S
1306 $CHACHAASM_aarch64=chacha-armv8.S
1307
1308 - $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s
1309 + $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
1310 $CHACHAASM_ppc64=$CHACHAASM_ppc32
1311
1312 $CHACHAASM_c64xplus=chacha-c64xplus.s
1313 @@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM
1314 GENERATE[chacha-x86.S]=asm/chacha-x86.pl
1315 GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl
1316 GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl
1317 +GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
1318 GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
1319 INCLUDE[chacha-armv4.o]=..
1320 GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
1321 diff --git a/crypto/chacha/chacha_ppc.c b/crypto/chacha/chacha_ppc.c
1322 index 5319040..f99cca8 100644
1323 --- a/crypto/chacha/chacha_ppc.c
1324 +++ b/crypto/chacha/chacha_ppc.c
1325 @@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
1326 void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
1327 size_t len, const unsigned int key[8],
1328 const unsigned int counter[4]);
1329 +void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
1330 + size_t len, const unsigned int key[8],
1331 + const unsigned int counter[4]);
1332 void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
1333 size_t len, const unsigned int key[8],
1334 const unsigned int counter[4])
1335 {
1336 - OPENSSL_ppccap_P & PPC_CRYPTO207
1337 - ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
1338 - : OPENSSL_ppccap_P & PPC_ALTIVEC
1339 - ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
1340 - : ChaCha20_ctr32_int(out, inp, len, key, counter);
1341 + OPENSSL_ppccap_P & PPC_BRD31
1342 + ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter)
1343 + :OPENSSL_ppccap_P & PPC_CRYPTO207
1344 + ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
1345 + : OPENSSL_ppccap_P & PPC_ALTIVEC
1346 + ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
1347 + : ChaCha20_ctr32_int(out, inp, len, key, counter);
1348 }
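
The nested conditional expression in the hunk above is equivalent to the following if/else chain, shown here only for readability:

    /* Same dispatch as the nested ?: in ChaCha20_ctr32() above. */
    if (OPENSSL_ppccap_P & PPC_BRD31)            /* PowerISA 3.1 (POWER10+)  */
        ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_CRYPTO207)   /* PowerISA 2.07 (POWER8/9) */
        ChaCha20_ctr32_vsx(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_ALTIVEC)     /* AltiVec/VMX only         */
        ChaCha20_ctr32_vmx(out, inp, len, key, counter);
    else
        ChaCha20_ctr32_int(out, inp, len, key, counter);
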
1349 diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
1350 index 2ee4440..4590340 100755
1351 --- a/crypto/perlasm/ppc-xlate.pl
1352 +++ b/crypto/perlasm/ppc-xlate.pl
1353 @@ -293,6 +293,14 @@ my $vpermdi = sub { # xxpermdi
1354 $dm = oct($dm) if ($dm =~ /^0/);
1355 " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
1356 };
1357 +my $vxxlor = sub { # xxlor
1358 + my ($f, $vrt, $vra, $vrb) = @_;
1359 + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6;
1360 +};
1361 +my $vxxlorc = sub { # xxlor
1362 + my ($f, $vrt, $vra, $vrb) = @_;
1363 + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1;
1364 +};
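
The two perlasm helpers above hand-assemble xxlor as a raw .long; the same field packing in C, mirroring the Perl expressions (illustrative only):

    #include <stdint.h>

    /* Mirrors $vxxlor/$vxxlorc: XX3-form xxlor, primary opcode 60, extended
     * opcode 146; the low AX/BX/TX bits (6 vs 1) select whether the operands
     * live in VSR[0..31] or in the VR half VSR[32..63]. */
    static uint32_t encode_xxlor(unsigned vrt, unsigned vra, unsigned vrb,
                                 unsigned axbxtx)
    {
        return (60u << 26) | (vrt << 21) | (vra << 16) | (vrb << 11)
               | (146u << 3) | axbxtx;   /* 6 for vxxlor, 1 for vxxlorc */
    }
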
1365
1366 # PowerISA 2.07 stuff
1367 sub vcrypto_op {
1368 @@ -377,6 +385,15 @@ my $addex = sub {
1369 };
1370 my $vmsumudm = sub { vfour_vsr(@_, 35); };
1371
1372 +# PowerISA 3.1 stuff
1373 +my $brd = sub {
1374 + my ($f, $ra, $rs) = @_;
1375 + " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1);
1376 +};
1377 +my $vsrq = sub { vcrypto_op(@_, 517); };
1378 +
1379 +
1380 +
1381 while($line=<>) {
1382
1383 $line =~ s|[#!;].*$||; # get rid of asm-style comments...
1384 diff --git a/crypto/ppccap.c b/crypto/ppccap.c
1385 index 8bcfed2..664627c 100644
1386 --- a/crypto/ppccap.c
1387 +++ b/crypto/ppccap.c
1388 @@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void);
1389 void OPENSSL_altivec_probe(void);
1390 void OPENSSL_crypto207_probe(void);
1391 void OPENSSL_madd300_probe(void);
1392 +void OPENSSL_brd31_probe(void);
1393
1394 long OPENSSL_rdtsc_mftb(void);
1395 long OPENSSL_rdtsc_mfspr268(void);
1396 @@ -117,16 +118,21 @@ static unsigned long getauxval(unsigned long key)
1397 #endif
1398
1399 /* I wish <sys/auxv.h> was universally available */
1400 -#define HWCAP 16 /* AT_HWCAP */
1401 +#ifndef AT_HWCAP
1402 +# define AT_HWCAP 16 /* AT_HWCAP */
1403 +#endif
1404 #define HWCAP_PPC64 (1U << 30)
1405 #define HWCAP_ALTIVEC (1U << 28)
1406 #define HWCAP_FPU (1U << 27)
1407 #define HWCAP_POWER6_EXT (1U << 9)
1408 #define HWCAP_VSX (1U << 7)
1409
1410 -#define HWCAP2 26 /* AT_HWCAP2 */
1411 +#ifndef AT_HWCAP2
1412 +# define AT_HWCAP2 26 /* AT_HWCAP2 */
1413 +#endif
1414 #define HWCAP_VEC_CRYPTO (1U << 25)
1415 #define HWCAP_ARCH_3_00 (1U << 23)
1416 +#define HWCAP_ARCH_3_1 (1U << 18)
1417
1418 # if defined(__GNUC__) && __GNUC__>=2
1419 __attribute__ ((constructor))
1420 @@ -187,6 +193,9 @@ void OPENSSL_cpuid_setup(void)
1421 if (__power_set(0xffffffffU<<17)) /* POWER9 and later */
1422 OPENSSL_ppccap_P |= PPC_MADD300;
1423
1424 + if (__power_set(0xffffffffU<<18)) /* POWER10 and later */
1425 + OPENSSL_ppccap_P |= PPC_BRD31;
1426 +
1427 return;
1428 # endif
1429 #endif
1430 @@ -215,8 +224,8 @@ void OPENSSL_cpuid_setup(void)
1431
1432 #ifdef OSSL_IMPLEMENT_GETAUXVAL
1433 {
1434 - unsigned long hwcap = getauxval(HWCAP);
1435 - unsigned long hwcap2 = getauxval(HWCAP2);
1436 + unsigned long hwcap = getauxval(AT_HWCAP);
1437 + unsigned long hwcap2 = getauxval(AT_HWCAP2);
1438
1439 if (hwcap & HWCAP_FPU) {
1440 OPENSSL_ppccap_P |= PPC_FPU;
1441 @@ -242,6 +251,10 @@ void OPENSSL_cpuid_setup(void)
1442 if (hwcap2 & HWCAP_ARCH_3_00) {
1443 OPENSSL_ppccap_P |= PPC_MADD300;
1444 }
1445 +
1446 + if (hwcap2 & HWCAP_ARCH_3_1) {
1447 + OPENSSL_ppccap_P |= PPC_BRD31;
1448 + }
1449 }
1450 #endif
1451
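
On Linux the same PowerISA 3.1 test can be written as a stand-alone getauxval() check; the 1U << 18 bit matches the HWCAP_ARCH_3_1 definition added above (a sketch only):

    #include <sys/auxv.h>

    /* Stand-alone version of the hwcap2 test above (PowerISA 3.1 / POWER10). */
    static int cpu_has_arch_3_1(void)
    {
    #ifdef AT_HWCAP2
        return (getauxval(AT_HWCAP2) & (1U << 18)) != 0;   /* HWCAP_ARCH_3_1 */
    #else
        return 0;
    #endif
    }
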
1452 @@ -263,7 +276,7 @@ void OPENSSL_cpuid_setup(void)
1453 sigaction(SIGILL, &ill_act, &ill_oact);
1454
1455 #ifndef OSSL_IMPLEMENT_GETAUXVAL
1456 - if (sigsetjmp(ill_jmp,1) == 0) {
1457 + if (sigsetjmp(ill_jmp, 1) == 0) {
1458 OPENSSL_fpu_probe();
1459 OPENSSL_ppccap_P |= PPC_FPU;
1460
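
Where getauxval() is unavailable, ppccap.c falls back to SIGILL probing, as the hunk above shows for the existing probes; for the new brd probe the pattern looks roughly like this (a simplified sketch of the existing mechanism, not new OpenSSL API):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    void OPENSSL_brd31_probe(void);            /* declared in the hunk above */

    static sigjmp_buf ill_jmp;

    static void ill_handler(int sig)
    {
        (void)sig;
        siglongjmp(ill_jmp, 1);                /* unwind out of the faulting probe */
    }

    /* Run a tiny routine containing the new instruction; if the CPU lacks it,
     * SIGILL fires and we land back here without setting the flag. */
    static int probe_brd31(void)
    {
        struct sigaction act, oact;
        int ok = 0;

        memset(&act, 0, sizeof(act));
        act.sa_handler = ill_handler;
        sigaction(SIGILL, &act, &oact);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            OPENSSL_brd31_probe();             /* executes 'brd' (PowerISA 3.1) */
            ok = 1;
        }
        sigaction(SIGILL, &oact, NULL);
        return ok;
    }
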
1461 diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
1462 index c6555df..706164a 100755
1463 --- a/crypto/ppccpuid.pl
1464 +++ b/crypto/ppccpuid.pl
1465 @@ -81,6 +81,17 @@ $code=<<___;
1466 .long 0
1467 .byte 0,12,0x14,0,0,0,0,0
1468
1469 +.globl .OPENSSL_brd31_probe
1470 +.align 4
1471 +.OPENSSL_brd31_probe:
1472 + xor r0,r0,r0
1473 + brd r3,r0
1474 + blr
1475 + .long 0
1476 + .byte 0,12,0x14,0,0,0,0,0
1477 +.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
1478 +
1479 +
1480 .globl .OPENSSL_wipe_cpu
1481 .align 4
1482 .OPENSSL_wipe_cpu:
1483 diff --git a/include/crypto/ppc_arch.h b/include/crypto/ppc_arch.h
1484 index 3b3ce4b..fcc846c 100644
1485 --- a/include/crypto/ppc_arch.h
1486 +++ b/include/crypto/ppc_arch.h
1487 @@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P;
1488 # define PPC_MADD300 (1<<4)
1489 # define PPC_MFTB (1<<5)
1490 # define PPC_MFSPR268 (1<<6)
1491 +# define PPC_BRD31 (1<<7)
1492
1493 #endif
