Upstream-Status: Backport [
https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149,
https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa,
hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447
]
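
The backport adds a Power10-tuned ChaCha20 implementation
(crypto/chacha/asm/chachap10-ppc.pl), the extra instruction encodings it needs in
crypto/perlasm/ppc-xlate.pl, and a new PPC_BRD31 capability bit that is set at run
time from AT_HWCAP2 (HWCAP_ARCH_3_1) or via __power_set() on AIX. The dispatch
change in crypto/chacha/chacha_ppc.c then prefers the new routine and falls back to
the existing VSX, AltiVec and integer paths. Condensed into if/else form purely as an
illustration of the nested ternary in the hunk further down:

    if (OPENSSL_ppccap_P & PPC_BRD31)           /* PowerISA 3.1 (Power10 and later) */
        ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_CRYPTO207)  /* PowerISA 2.07 (POWER8/POWER9) */
        ChaCha20_ctr32_vsx(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_ALTIVEC)
        ChaCha20_ctr32_vmx(out, inp, len, key, counter);
    else
        ChaCha20_ctr32_int(out, inp, len, key, counter);
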
diff --git a/crypto/chacha/asm/chachap10-ppc.pl b/crypto/chacha/asm/chachap10-ppc.pl
new file mode 100755
index 0000000..36e9a8d
--- /dev/null
+++ b/crypto/chacha/asm/chachap10-ppc.pl
@@ -0,0 +1,1288 @@
+#! /usr/bin/env perl
+# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2015
+#
+# ChaCha20 for PowerPC/AltiVec.
+#
+# June 2018
+#
+# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
+# processors that can't issue more than one vector instruction per
+# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
+# interleave would perform better. Incidentally PowerISA 2.07 (first
+# implemented by POWER8) defined new usable instructions, hence the 4xVSX
+# code path...
+#
+# Performance in cycles per byte out of large buffer.
+#
+#                      IALU/gcc-4.x    3xAltiVec+1xIALU        4xVSX
+#
+# Freescale e300       13.6/+115%      -                       -
+# PPC74x0/G4e          6.81/+310%      3.81                    -
+# PPC970/G5            9.29/+160%      ?                       -
+# POWER7               8.62/+61%       3.35                    -
+# POWER8               8.70/+51%       2.91                    2.09
+# POWER9               8.80/+29%       4.44(*)                 2.45(**)
+#
+# (*)  this is a trade-off result, it's possible to improve it, but
+#      then it would negatively affect all others;
+# (**) POWER9 seems to be "allergic" to mixing vector and integer
+#      instructions, which is why the switch to vector-only code pays
+#      off that much;
+
58 |
+# $output is the last argument if it looks like a file (it has an extension) |
59 |
+# $flavour is the first argument if it doesn't look like a file |
60 |
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; |
61 |
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; |
62 |
+ |
63 |
+if ($flavour =~ /64/) { |
64 |
+ $SIZE_T =8; |
65 |
+ $LRSAVE =2*$SIZE_T; |
66 |
+ $STU ="stdu"; |
67 |
+ $POP ="ld"; |
68 |
+ $PUSH ="std"; |
69 |
+ $UCMP ="cmpld"; |
70 |
+} elsif ($flavour =~ /32/) { |
71 |
+ $SIZE_T =4; |
72 |
+ $LRSAVE =$SIZE_T; |
73 |
+ $STU ="stwu"; |
74 |
+ $POP ="lwz"; |
75 |
+ $PUSH ="stw"; |
76 |
+ $UCMP ="cmplw"; |
77 |
+} else { die "nonsense $flavour"; } |
78 |
+ |
79 |
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0; |
80 |
+ |
81 |
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
82 |
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
83 |
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
84 |
+die "can't locate ppc-xlate.pl"; |
85 |
+ |
86 |
+open STDOUT,"| $^X $xlate $flavour \"$output\"" |
87 |
+ or die "can't call $xlate: $!"; |
88 |
+ |
89 |
+$LOCALS=6*$SIZE_T; |
90 |
+$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables |
91 |
+ |
92 |
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm |
93 |
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
94 |
+ $code .= "\t$opcode\t".join(',',@_)."\n"; |
95 |
+} |
96 |
+ |
97 |
+my $sp = "r1"; |
98 |
+ |
99 |
+my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); |
100 |
+ |
101 |
+ |
102 |
+{{{ |
103 |
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
104 |
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); |
105 |
+my @K = map("v$_",(16..19)); |
106 |
+my $CTR = "v26"; |
107 |
+my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); |
108 |
+my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); |
109 |
+my $beperm = "v31"; |
110 |
+ |
111 |
+my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); |
112 |
+ |
113 |
+my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload |
114 |
+ |
115 |
+ |
116 |
+sub VSX_lane_ROUND_4x { |
117 |
+my ($a0,$b0,$c0,$d0)=@_; |
118 |
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
119 |
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
120 |
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
121 |
+my @x=map("\"v$_\"",(0..15)); |
122 |
+ |
123 |
+ ( |
124 |
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 |
125 |
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 |
126 |
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 |
127 |
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 |
128 |
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
129 |
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
130 |
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
131 |
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
132 |
+ "&vrlw (@x[$d0],@x[$d0],'$sixteen')", |
133 |
+ "&vrlw (@x[$d1],@x[$d1],'$sixteen')", |
134 |
+ "&vrlw (@x[$d2],@x[$d2],'$sixteen')", |
135 |
+ "&vrlw (@x[$d3],@x[$d3],'$sixteen')", |
136 |
+ |
137 |
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
138 |
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
139 |
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
140 |
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
141 |
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
142 |
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
143 |
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
144 |
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
145 |
+ "&vrlw (@x[$b0],@x[$b0],'$twelve')", |
146 |
+ "&vrlw (@x[$b1],@x[$b1],'$twelve')", |
147 |
+ "&vrlw (@x[$b2],@x[$b2],'$twelve')", |
148 |
+ "&vrlw (@x[$b3],@x[$b3],'$twelve')", |
149 |
+ |
150 |
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", |
151 |
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", |
152 |
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", |
153 |
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", |
154 |
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
155 |
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
156 |
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
157 |
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
158 |
+ "&vrlw (@x[$d0],@x[$d0],'$eight')", |
159 |
+ "&vrlw (@x[$d1],@x[$d1],'$eight')", |
160 |
+ "&vrlw (@x[$d2],@x[$d2],'$eight')", |
161 |
+ "&vrlw (@x[$d3],@x[$d3],'$eight')", |
162 |
+ |
163 |
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
164 |
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
165 |
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
166 |
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
167 |
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
168 |
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
169 |
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
170 |
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
171 |
+ "&vrlw (@x[$b0],@x[$b0],'$seven')", |
172 |
+ "&vrlw (@x[$b1],@x[$b1],'$seven')", |
173 |
+ "&vrlw (@x[$b2],@x[$b2],'$seven')", |
174 |
+ "&vrlw (@x[$b3],@x[$b3],'$seven')" |
175 |
+ ); |
176 |
+} |
177 |
+ |
178 |
+$code.=<<___; |
179 |
+ |
180 |
+.globl .ChaCha20_ctr32_vsx_p10 |
181 |
+.align 5 |
182 |
+.ChaCha20_ctr32_vsx_p10: |
183 |
+ ${UCMP}i $len,255 |
184 |
+ bgt ChaCha20_ctr32_vsx_8x |
185 |
+ $STU $sp,-$FRAME($sp) |
186 |
+ mflr r0 |
187 |
+ li r10,`15+$LOCALS+64` |
188 |
+ li r11,`31+$LOCALS+64` |
189 |
+ mfspr r12,256 |
190 |
+ stvx v26,r10,$sp |
191 |
+ addi r10,r10,32 |
192 |
+ stvx v27,r11,$sp |
193 |
+ addi r11,r11,32 |
194 |
+ stvx v28,r10,$sp |
195 |
+ addi r10,r10,32 |
196 |
+ stvx v29,r11,$sp |
197 |
+ addi r11,r11,32 |
198 |
+ stvx v30,r10,$sp |
199 |
+ stvx v31,r11,$sp |
200 |
+ stw r12,`$FRAME-4`($sp) # save vrsave |
201 |
+ li r12,-4096+63 |
202 |
+ $PUSH r0, `$FRAME+$LRSAVE`($sp) |
203 |
+ mtspr 256,r12 # preserve 29 AltiVec registers |
204 |
+ |
205 |
+ bl Lconsts # returns pointer Lsigma in r12 |
206 |
+ lvx_4w @K[0],0,r12 # load sigma |
207 |
+ addi r12,r12,0x70 |
208 |
+ li $x10,16 |
209 |
+ li $x20,32 |
210 |
+ li $x30,48 |
211 |
+ li r11,64 |
212 |
+ |
213 |
+ lvx_4w @K[1],0,$key # load key |
214 |
+ lvx_4w @K[2],$x10,$key |
215 |
+ lvx_4w @K[3],0,$ctr # load counter |
216 |
+ |
217 |
+ vxor $xt0,$xt0,$xt0 |
218 |
+ lvx_4w $xt1,r11,r12 |
219 |
+ vspltw $CTR,@K[3],0 |
220 |
+ vsldoi @K[3],@K[3],$xt0,4 |
221 |
+ vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] |
222 |
+ vadduwm $CTR,$CTR,$xt1 |
223 |
+ |
224 |
+ be?lvsl $beperm,0,$x10 # 0x00..0f |
225 |
+ be?vspltisb $xt0,3 # 0x03..03 |
226 |
+ be?vxor $beperm,$beperm,$xt0 # swap bytes within words |
227 |
+ |
228 |
+ li r0,10 # inner loop counter |
229 |
+ mtctr r0 |
230 |
+ b Loop_outer_vsx |
231 |
+ |
232 |
+.align 5 |
233 |
+Loop_outer_vsx: |
234 |
+ lvx $xa0,$x00,r12 # load [smashed] sigma |
235 |
+ lvx $xa1,$x10,r12 |
236 |
+ lvx $xa2,$x20,r12 |
237 |
+ lvx $xa3,$x30,r12 |
238 |
+ |
239 |
+ vspltw $xb0,@K[1],0 # smash the key |
240 |
+ vspltw $xb1,@K[1],1 |
241 |
+ vspltw $xb2,@K[1],2 |
242 |
+ vspltw $xb3,@K[1],3 |
243 |
+ |
244 |
+ vspltw $xc0,@K[2],0 |
245 |
+ vspltw $xc1,@K[2],1 |
246 |
+ vspltw $xc2,@K[2],2 |
247 |
+ vspltw $xc3,@K[2],3 |
248 |
+ |
249 |
+ vmr $xd0,$CTR # smash the counter |
250 |
+ vspltw $xd1,@K[3],1 |
251 |
+ vspltw $xd2,@K[3],2 |
252 |
+ vspltw $xd3,@K[3],3 |
253 |
+ |
254 |
+ vspltisw $sixteen,-16 # synthesize constants |
255 |
+ vspltisw $twelve,12 |
256 |
+ vspltisw $eight,8 |
257 |
+ vspltisw $seven,7 |
258 |
+ |
259 |
+Loop_vsx_4x: |
260 |
+___ |
261 |
+ foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } |
262 |
+ foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; } |
263 |
+$code.=<<___; |
264 |
+ |
265 |
+ bdnz Loop_vsx_4x |
266 |
+ |
267 |
+ vadduwm $xd0,$xd0,$CTR |
268 |
+ |
269 |
+ vmrgew $xt0,$xa0,$xa1 # transpose data |
270 |
+ vmrgew $xt1,$xa2,$xa3 |
271 |
+ vmrgow $xa0,$xa0,$xa1 |
272 |
+ vmrgow $xa2,$xa2,$xa3 |
273 |
+ vmrgew $xt2,$xb0,$xb1 |
274 |
+ vmrgew $xt3,$xb2,$xb3 |
275 |
+ vpermdi $xa1,$xa0,$xa2,0b00 |
276 |
+ vpermdi $xa3,$xa0,$xa2,0b11 |
277 |
+ vpermdi $xa0,$xt0,$xt1,0b00 |
278 |
+ vpermdi $xa2,$xt0,$xt1,0b11 |
279 |
+ |
280 |
+ vmrgow $xb0,$xb0,$xb1 |
281 |
+ vmrgow $xb2,$xb2,$xb3 |
282 |
+ vmrgew $xt0,$xc0,$xc1 |
283 |
+ vmrgew $xt1,$xc2,$xc3 |
284 |
+ vpermdi $xb1,$xb0,$xb2,0b00 |
285 |
+ vpermdi $xb3,$xb0,$xb2,0b11 |
286 |
+ vpermdi $xb0,$xt2,$xt3,0b00 |
287 |
+ vpermdi $xb2,$xt2,$xt3,0b11 |
288 |
+ |
289 |
+ vmrgow $xc0,$xc0,$xc1 |
290 |
+ vmrgow $xc2,$xc2,$xc3 |
291 |
+ vmrgew $xt2,$xd0,$xd1 |
292 |
+ vmrgew $xt3,$xd2,$xd3 |
293 |
+ vpermdi $xc1,$xc0,$xc2,0b00 |
294 |
+ vpermdi $xc3,$xc0,$xc2,0b11 |
295 |
+ vpermdi $xc0,$xt0,$xt1,0b00 |
296 |
+ vpermdi $xc2,$xt0,$xt1,0b11 |
297 |
+ |
298 |
+ vmrgow $xd0,$xd0,$xd1 |
299 |
+ vmrgow $xd2,$xd2,$xd3 |
300 |
+ vspltisw $xt0,4 |
301 |
+ vadduwm $CTR,$CTR,$xt0 # next counter value |
302 |
+ vpermdi $xd1,$xd0,$xd2,0b00 |
303 |
+ vpermdi $xd3,$xd0,$xd2,0b11 |
304 |
+ vpermdi $xd0,$xt2,$xt3,0b00 |
305 |
+ vpermdi $xd2,$xt2,$xt3,0b11 |
306 |
+ |
307 |
+ vadduwm $xa0,$xa0,@K[0] |
308 |
+ vadduwm $xb0,$xb0,@K[1] |
309 |
+ vadduwm $xc0,$xc0,@K[2] |
310 |
+ vadduwm $xd0,$xd0,@K[3] |
311 |
+ |
312 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
313 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
314 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
315 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
316 |
+ |
317 |
+ ${UCMP}i $len,0x40 |
318 |
+ blt Ltail_vsx |
319 |
+ |
320 |
+ lvx_4w $xt0,$x00,$inp |
321 |
+ lvx_4w $xt1,$x10,$inp |
322 |
+ lvx_4w $xt2,$x20,$inp |
323 |
+ lvx_4w $xt3,$x30,$inp |
324 |
+ |
325 |
+ vxor $xt0,$xt0,$xa0 |
326 |
+ vxor $xt1,$xt1,$xb0 |
327 |
+ vxor $xt2,$xt2,$xc0 |
328 |
+ vxor $xt3,$xt3,$xd0 |
329 |
+ |
330 |
+ stvx_4w $xt0,$x00,$out |
331 |
+ stvx_4w $xt1,$x10,$out |
332 |
+ addi $inp,$inp,0x40 |
333 |
+ stvx_4w $xt2,$x20,$out |
334 |
+ subi $len,$len,0x40 |
335 |
+ stvx_4w $xt3,$x30,$out |
336 |
+ addi $out,$out,0x40 |
337 |
+ beq Ldone_vsx |
338 |
+ |
339 |
+ vadduwm $xa0,$xa1,@K[0] |
340 |
+ vadduwm $xb0,$xb1,@K[1] |
341 |
+ vadduwm $xc0,$xc1,@K[2] |
342 |
+ vadduwm $xd0,$xd1,@K[3] |
343 |
+ |
344 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
345 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
346 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
347 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
348 |
+ |
349 |
+ ${UCMP}i $len,0x40 |
350 |
+ blt Ltail_vsx |
351 |
+ |
352 |
+ lvx_4w $xt0,$x00,$inp |
353 |
+ lvx_4w $xt1,$x10,$inp |
354 |
+ lvx_4w $xt2,$x20,$inp |
355 |
+ lvx_4w $xt3,$x30,$inp |
356 |
+ |
357 |
+ vxor $xt0,$xt0,$xa0 |
358 |
+ vxor $xt1,$xt1,$xb0 |
359 |
+ vxor $xt2,$xt2,$xc0 |
360 |
+ vxor $xt3,$xt3,$xd0 |
361 |
+ |
362 |
+ stvx_4w $xt0,$x00,$out |
363 |
+ stvx_4w $xt1,$x10,$out |
364 |
+ addi $inp,$inp,0x40 |
365 |
+ stvx_4w $xt2,$x20,$out |
366 |
+ subi $len,$len,0x40 |
367 |
+ stvx_4w $xt3,$x30,$out |
368 |
+ addi $out,$out,0x40 |
369 |
+ beq Ldone_vsx |
370 |
+ |
371 |
+ vadduwm $xa0,$xa2,@K[0] |
372 |
+ vadduwm $xb0,$xb2,@K[1] |
373 |
+ vadduwm $xc0,$xc2,@K[2] |
374 |
+ vadduwm $xd0,$xd2,@K[3] |
375 |
+ |
376 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
377 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
378 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
379 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
380 |
+ |
381 |
+ ${UCMP}i $len,0x40 |
382 |
+ blt Ltail_vsx |
383 |
+ |
384 |
+ lvx_4w $xt0,$x00,$inp |
385 |
+ lvx_4w $xt1,$x10,$inp |
386 |
+ lvx_4w $xt2,$x20,$inp |
387 |
+ lvx_4w $xt3,$x30,$inp |
388 |
+ |
389 |
+ vxor $xt0,$xt0,$xa0 |
390 |
+ vxor $xt1,$xt1,$xb0 |
391 |
+ vxor $xt2,$xt2,$xc0 |
392 |
+ vxor $xt3,$xt3,$xd0 |
393 |
+ |
394 |
+ stvx_4w $xt0,$x00,$out |
395 |
+ stvx_4w $xt1,$x10,$out |
396 |
+ addi $inp,$inp,0x40 |
397 |
+ stvx_4w $xt2,$x20,$out |
398 |
+ subi $len,$len,0x40 |
399 |
+ stvx_4w $xt3,$x30,$out |
400 |
+ addi $out,$out,0x40 |
401 |
+ beq Ldone_vsx |
402 |
+ |
403 |
+ vadduwm $xa0,$xa3,@K[0] |
404 |
+ vadduwm $xb0,$xb3,@K[1] |
405 |
+ vadduwm $xc0,$xc3,@K[2] |
406 |
+ vadduwm $xd0,$xd3,@K[3] |
407 |
+ |
408 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
409 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
410 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
411 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
412 |
+ |
413 |
+ ${UCMP}i $len,0x40 |
414 |
+ blt Ltail_vsx |
415 |
+ |
416 |
+ lvx_4w $xt0,$x00,$inp |
417 |
+ lvx_4w $xt1,$x10,$inp |
418 |
+ lvx_4w $xt2,$x20,$inp |
419 |
+ lvx_4w $xt3,$x30,$inp |
420 |
+ |
421 |
+ vxor $xt0,$xt0,$xa0 |
422 |
+ vxor $xt1,$xt1,$xb0 |
423 |
+ vxor $xt2,$xt2,$xc0 |
424 |
+ vxor $xt3,$xt3,$xd0 |
425 |
+ |
426 |
+ stvx_4w $xt0,$x00,$out |
427 |
+ stvx_4w $xt1,$x10,$out |
428 |
+ addi $inp,$inp,0x40 |
429 |
+ stvx_4w $xt2,$x20,$out |
430 |
+ subi $len,$len,0x40 |
431 |
+ stvx_4w $xt3,$x30,$out |
432 |
+ addi $out,$out,0x40 |
433 |
+ mtctr r0 |
434 |
+ bne Loop_outer_vsx |
435 |
+ |
436 |
+Ldone_vsx: |
437 |
+ lwz r12,`$FRAME-4`($sp) # pull vrsave |
438 |
+ li r10,`15+$LOCALS+64` |
439 |
+ li r11,`31+$LOCALS+64` |
440 |
+ $POP r0, `$FRAME+$LRSAVE`($sp) |
441 |
+ mtspr 256,r12 # restore vrsave |
442 |
+ lvx v26,r10,$sp |
443 |
+ addi r10,r10,32 |
444 |
+ lvx v27,r11,$sp |
445 |
+ addi r11,r11,32 |
446 |
+ lvx v28,r10,$sp |
447 |
+ addi r10,r10,32 |
448 |
+ lvx v29,r11,$sp |
449 |
+ addi r11,r11,32 |
450 |
+ lvx v30,r10,$sp |
451 |
+ lvx v31,r11,$sp |
452 |
+ mtlr r0 |
453 |
+ addi $sp,$sp,$FRAME |
454 |
+ blr |
455 |
+ |
456 |
+.align 4 |
457 |
+Ltail_vsx: |
458 |
+ addi r11,$sp,$LOCALS |
459 |
+ mtctr $len |
460 |
+ stvx_4w $xa0,$x00,r11 # offload block to stack |
461 |
+ stvx_4w $xb0,$x10,r11 |
462 |
+ stvx_4w $xc0,$x20,r11 |
463 |
+ stvx_4w $xd0,$x30,r11 |
464 |
+ subi r12,r11,1 # prepare for *++ptr |
465 |
+ subi $inp,$inp,1 |
466 |
+ subi $out,$out,1 |
467 |
+ |
468 |
+Loop_tail_vsx: |
469 |
+ lbzu r6,1(r12) |
470 |
+ lbzu r7,1($inp) |
471 |
+ xor r6,r6,r7 |
472 |
+ stbu r6,1($out) |
473 |
+ bdnz Loop_tail_vsx |
474 |
+ |
475 |
+ stvx_4w $K[0],$x00,r11 # wipe copy of the block |
476 |
+ stvx_4w $K[0],$x10,r11 |
477 |
+ stvx_4w $K[0],$x20,r11 |
478 |
+ stvx_4w $K[0],$x30,r11 |
479 |
+ |
480 |
+ b Ldone_vsx |
481 |
+ .long 0 |
482 |
+ .byte 0,12,0x04,1,0x80,0,5,0 |
483 |
+ .long 0 |
484 |
+.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10 |
485 |
+___ |
486 |
+}}} |
487 |
+ |
+## This is the 8-blocks-in-parallel implementation. The heart of the ChaCha round uses vector
+# instructions that only have access to vsr[32+X]. To process 8 blocks in parallel we use all 32
+# vector registers to hold the block state, so a few register values have to be staged on the side;
+# that keeps VSR[32+X] free for the vector instructions used in the round and their intermediate values.
+# We use VSR[0]-VSR[31] to hold those staged/intermediate values while processing 8 blocks in parallel.
493 |
+{{{ |
494 |
+#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); |
495 |
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
496 |
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3, |
497 |
+ $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7, |
498 |
+ $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31)); |
499 |
+my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15)); |
500 |
+my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3)); |
501 |
+my @K = map("v$_",27,(24..26)); |
502 |
+my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31)); |
503 |
+my $xr0 = "v4"; |
504 |
+my $CTR0 = "v22"; |
505 |
+my $CTR1 = "v5"; |
506 |
+my $beperm = "v31"; |
507 |
+my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); |
508 |
+my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7)); |
509 |
+my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17)); |
510 |
+my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21)); |
511 |
+my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26)); |
512 |
+ |
513 |
+my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload |
514 |
+ |
515 |
+sub VSX_lane_ROUND_8x { |
516 |
+my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_; |
517 |
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
518 |
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
519 |
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
520 |
+my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4)); |
521 |
+my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5)); |
522 |
+my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6)); |
523 |
+my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17)); |
524 |
+my @x=map("\"v$_\"",(0..31)); |
525 |
+ |
526 |
+ ( |
527 |
+ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13 |
528 |
+ "&vxxlorc (@x[$c7], $xv9,$xv9)", |
529 |
+ |
530 |
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 |
531 |
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 |
532 |
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 |
533 |
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 |
534 |
+ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1 |
535 |
+ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2 |
536 |
+ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3 |
537 |
+ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4 |
538 |
+ |
539 |
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
540 |
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
541 |
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
542 |
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
543 |
+ "&vxor (@x[$d4],@x[$d4],@x[$a4])", |
544 |
+ "&vxor (@x[$d5],@x[$d5],@x[$a5])", |
545 |
+ "&vxor (@x[$d6],@x[$d6],@x[$a6])", |
546 |
+ "&vxor (@x[$d7],@x[$d7],@x[$a7])", |
547 |
+ |
548 |
+ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", |
549 |
+ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", |
550 |
+ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", |
551 |
+ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", |
552 |
+ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", |
553 |
+ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", |
554 |
+ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", |
555 |
+ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", |
556 |
+ |
557 |
+ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", |
558 |
+ "&vxxlorc (@x[$c7], $xv15,$xv15)", |
559 |
+ "&vxxlorc (@x[$a7], $xv10,$xv10)", |
560 |
+ |
561 |
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
562 |
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
563 |
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
564 |
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
565 |
+ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", |
566 |
+ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", |
567 |
+ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", |
568 |
+ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", |
569 |
+ |
570 |
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
571 |
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
572 |
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
573 |
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
574 |
+ "&vxor (@x[$b4],@x[$b4],@x[$c4])", |
575 |
+ "&vxor (@x[$b5],@x[$b5],@x[$c5])", |
576 |
+ "&vxor (@x[$b6],@x[$b6],@x[$c6])", |
577 |
+ "&vxor (@x[$b7],@x[$b7],@x[$c7])", |
578 |
+ |
579 |
+ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", |
580 |
+ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", |
581 |
+ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", |
582 |
+ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", |
583 |
+ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", |
584 |
+ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", |
585 |
+ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", |
586 |
+ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", |
587 |
+ |
588 |
+ "&vxxlorc (@x[$a7], $xv13,$xv13)", |
589 |
+ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", |
590 |
+ "&vxxlorc (@x[$c7], $xv11,$xv11)", |
591 |
+ |
592 |
+ |
593 |
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", |
594 |
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", |
595 |
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", |
596 |
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", |
597 |
+ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", |
598 |
+ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", |
599 |
+ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", |
600 |
+ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", |
601 |
+ |
602 |
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
603 |
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
604 |
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
605 |
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
606 |
+ "&vxor (@x[$d4],@x[$d4],@x[$a4])", |
607 |
+ "&vxor (@x[$d5],@x[$d5],@x[$a5])", |
608 |
+ "&vxor (@x[$d6],@x[$d6],@x[$a6])", |
609 |
+ "&vxor (@x[$d7],@x[$d7],@x[$a7])", |
610 |
+ |
611 |
+ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", |
612 |
+ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", |
613 |
+ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", |
614 |
+ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", |
615 |
+ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", |
616 |
+ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", |
617 |
+ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", |
618 |
+ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", |
619 |
+ |
620 |
+ "&vxxlorc (@x[$c7], $xv15,$xv15)", |
621 |
+ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", |
622 |
+ "&vxxlorc (@x[$a7], $xv12,$xv12)", |
623 |
+ |
624 |
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
625 |
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
626 |
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
627 |
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
628 |
+ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", |
629 |
+ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", |
630 |
+ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", |
631 |
+ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", |
632 |
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
633 |
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
634 |
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
635 |
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
636 |
+ "&vxor (@x[$b4],@x[$b4],@x[$c4])", |
637 |
+ "&vxor (@x[$b5],@x[$b5],@x[$c5])", |
638 |
+ "&vxor (@x[$b6],@x[$b6],@x[$c6])", |
639 |
+ "&vxor (@x[$b7],@x[$b7],@x[$c7])", |
640 |
+ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", |
641 |
+ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", |
642 |
+ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", |
643 |
+ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", |
644 |
+ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", |
645 |
+ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", |
646 |
+ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", |
647 |
+ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", |
648 |
+ |
649 |
+ "&vxxlorc (@x[$a7], $xv13,$xv13)", |
650 |
+ ); |
651 |
+} |
652 |
+ |
653 |
+$code.=<<___; |
654 |
+ |
655 |
+.globl .ChaCha20_ctr32_vsx_8x |
656 |
+.align 5 |
657 |
+.ChaCha20_ctr32_vsx_8x: |
658 |
+ $STU $sp,-$FRAME($sp) |
659 |
+ mflr r0 |
660 |
+ li r10,`15+$LOCALS+64` |
661 |
+ li r11,`31+$LOCALS+64` |
662 |
+ mfspr r12,256 |
663 |
+ stvx v24,r10,$sp |
664 |
+ addi r10,r10,32 |
665 |
+ stvx v25,r11,$sp |
666 |
+ addi r11,r11,32 |
667 |
+ stvx v26,r10,$sp |
668 |
+ addi r10,r10,32 |
669 |
+ stvx v27,r11,$sp |
670 |
+ addi r11,r11,32 |
671 |
+ stvx v28,r10,$sp |
672 |
+ addi r10,r10,32 |
673 |
+ stvx v29,r11,$sp |
674 |
+ addi r11,r11,32 |
675 |
+ stvx v30,r10,$sp |
676 |
+ stvx v31,r11,$sp |
677 |
+ stw r12,`$FRAME-4`($sp) # save vrsave |
678 |
+ li r12,-4096+63 |
679 |
+ $PUSH r0, `$FRAME+$LRSAVE`($sp) |
680 |
+ mtspr 256,r12 # preserve 29 AltiVec registers |
681 |
+ |
682 |
+ bl Lconsts # returns pointer Lsigma in r12 |
683 |
+ |
684 |
+ lvx_4w @K[0],0,r12 # load sigma |
685 |
+ addi r12,r12,0x70 |
686 |
+ li $x10,16 |
687 |
+ li $x20,32 |
688 |
+ li $x30,48 |
689 |
+ li r11,64 |
690 |
+ |
691 |
+ vspltisw $xa4,-16 # synthesize constants |
692 |
+ vspltisw $xb4,12 # synthesize constants |
693 |
+ vspltisw $xc4,8 # synthesize constants |
694 |
+ vspltisw $xd4,7 # synthesize constants |
695 |
+ |
696 |
+ lvx $xa0,$x00,r12 # load [smashed] sigma |
697 |
+ lvx $xa1,$x10,r12 |
698 |
+ lvx $xa2,$x20,r12 |
699 |
+ lvx $xa3,$x30,r12 |
700 |
+ |
701 |
+ vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12 |
702 |
+ vxxlor $xv10 ,$xb4,$xb4 |
703 |
+ vxxlor $xv11 ,$xc4,$xc4 |
704 |
+ vxxlor $xv12 ,$xd4,$xd4 |
705 |
+ vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25 |
706 |
+ vxxlor $xv23 ,$xa1,$xa1 |
707 |
+ vxxlor $xv24 ,$xa2,$xa2 |
708 |
+ vxxlor $xv25 ,$xa3,$xa3 |
709 |
+ |
710 |
+ lvx_4w @K[1],0,$key # load key |
711 |
+ lvx_4w @K[2],$x10,$key |
712 |
+ lvx_4w @K[3],0,$ctr # load counter |
713 |
+ vspltisw $xt3,4 |
714 |
+ |
715 |
+ |
716 |
+ vxor $xt2,$xt2,$xt2 |
717 |
+ lvx_4w $xt1,r11,r12 |
718 |
+ vspltw $xa2,@K[3],0 #save the original count after spltw |
719 |
+ vsldoi @K[3],@K[3],$xt2,4 |
720 |
+ vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0] |
721 |
+ vadduwm $xt1,$xa2,$xt1 |
722 |
+ vadduwm $xt3,$xt1,$xt3 # next counter value |
723 |
+ vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8. |
724 |
+ |
725 |
+ be?lvsl $beperm,0,$x10 # 0x00..0f |
726 |
+ be?vspltisb $xt0,3 # 0x03..03 |
727 |
+ be?vxor $beperm,$beperm,$xt0 # swap bytes within words |
728 |
+ be?vxxlor $xv26 ,$beperm,$beperm |
729 |
+ |
730 |
+ vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2 |
731 |
+ vxxlor $xv1 ,@K[1],@K[1] |
732 |
+ vxxlor $xv2 ,@K[2],@K[2] |
733 |
+ vxxlor $xv3 ,@K[3],@K[3] |
734 |
+ vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5 |
735 |
+ vxxlor $xv5 ,$xt3,$xt3 |
736 |
+ vxxlor $xv8 ,$xa0,$xa0 |
737 |
+ |
738 |
+ li r0,10 # inner loop counter |
739 |
+ mtctr r0 |
740 |
+ b Loop_outer_vsx_8x |
741 |
+ |
742 |
+.align 5 |
743 |
+Loop_outer_vsx_8x: |
744 |
+ vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma |
745 |
+ vxxlorc $xa1,$xv23,$xv23 |
746 |
+ vxxlorc $xa2,$xv24,$xv24 |
747 |
+ vxxlorc $xa3,$xv25,$xv25 |
748 |
+ vxxlorc $xa4,$xv22,$xv22 |
749 |
+ vxxlorc $xa5,$xv23,$xv23 |
750 |
+ vxxlorc $xa6,$xv24,$xv24 |
751 |
+ vxxlorc $xa7,$xv25,$xv25 |
752 |
+ |
753 |
+ vspltw $xb0,@K[1],0 # smash the key |
754 |
+ vspltw $xb1,@K[1],1 |
755 |
+ vspltw $xb2,@K[1],2 |
756 |
+ vspltw $xb3,@K[1],3 |
757 |
+ vspltw $xb4,@K[1],0 # smash the key |
758 |
+ vspltw $xb5,@K[1],1 |
759 |
+ vspltw $xb6,@K[1],2 |
760 |
+ vspltw $xb7,@K[1],3 |
761 |
+ |
762 |
+ vspltw $xc0,@K[2],0 |
763 |
+ vspltw $xc1,@K[2],1 |
764 |
+ vspltw $xc2,@K[2],2 |
765 |
+ vspltw $xc3,@K[2],3 |
766 |
+ vspltw $xc4,@K[2],0 |
767 |
+ vspltw $xc7,@K[2],3 |
768 |
+ vspltw $xc5,@K[2],1 |
769 |
+ |
770 |
+ vxxlorc $xd0,$xv4,$xv4 # smash the counter |
771 |
+ vspltw $xd1,@K[3],1 |
772 |
+ vspltw $xd2,@K[3],2 |
773 |
+ vspltw $xd3,@K[3],3 |
774 |
+ vxxlorc $xd4,$xv5,$xv5 # smash the counter |
775 |
+ vspltw $xd5,@K[3],1 |
776 |
+ vspltw $xd6,@K[3],2 |
777 |
+ vspltw $xd7,@K[3],3 |
778 |
+ vxxlorc $xc6,$xv8,$xv8 #copy of vlspt k[2],2 is in v8.v26 ->k[3] so need to wait until k3 is done |
779 |
+ |
780 |
+Loop_vsx_8x: |
781 |
+___ |
782 |
+ foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; } |
783 |
+ foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; } |
784 |
+$code.=<<___; |
785 |
+ |
786 |
+ bdnz Loop_vsx_8x |
787 |
+ vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31 |
788 |
+ vxxlor $xv14 ,$xd5,$xd5 # |
789 |
+ vxxlor $xv15 ,$xd6,$xd6 # |
790 |
+ vxxlor $xv16 ,$xd7,$xd7 # |
791 |
+ |
792 |
+ vxxlor $xv18 ,$xc4,$xc4 # |
793 |
+ vxxlor $xv19 ,$xc5,$xc5 # |
794 |
+ vxxlor $xv20 ,$xc6,$xc6 # |
795 |
+ vxxlor $xv21 ,$xc7,$xc7 # |
796 |
+ |
797 |
+ vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs |
798 |
+ vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs |
799 |
+	be?vxxlorc	$beperm,$xv26,$xv26	# copy back the beperm.
800 |
+ |
801 |
+ vxxlorc @K[0],$xv0,$xv0 #27 |
802 |
+ vxxlorc @K[1],$xv1,$xv1 #24 |
803 |
+ vxxlorc @K[2],$xv2,$xv2 #25 |
804 |
+ vxxlorc @K[3],$xv3,$xv3 #26 |
805 |
+ vxxlorc $CTR0,$xv4,$xv4 |
806 |
+###changing to vertical |
807 |
+ |
808 |
+ vmrgew $xt0,$xa0,$xa1 # transpose data |
809 |
+ vmrgew $xt1,$xa2,$xa3 |
810 |
+ vmrgow $xa0,$xa0,$xa1 |
811 |
+ vmrgow $xa2,$xa2,$xa3 |
812 |
+ |
813 |
+ vmrgew $xt2,$xb0,$xb1 |
814 |
+ vmrgew $xt3,$xb2,$xb3 |
815 |
+ vmrgow $xb0,$xb0,$xb1 |
816 |
+ vmrgow $xb2,$xb2,$xb3 |
817 |
+ |
818 |
+ vadduwm $xd0,$xd0,$CTR0 |
819 |
+ |
820 |
+ vpermdi $xa1,$xa0,$xa2,0b00 |
821 |
+ vpermdi $xa3,$xa0,$xa2,0b11 |
822 |
+ vpermdi $xa0,$xt0,$xt1,0b00 |
823 |
+ vpermdi $xa2,$xt0,$xt1,0b11 |
824 |
+ vpermdi $xb1,$xb0,$xb2,0b00 |
825 |
+ vpermdi $xb3,$xb0,$xb2,0b11 |
826 |
+ vpermdi $xb0,$xt2,$xt3,0b00 |
827 |
+ vpermdi $xb2,$xt2,$xt3,0b11 |
828 |
+ |
829 |
+ vmrgew $xt0,$xc0,$xc1 |
830 |
+ vmrgew $xt1,$xc2,$xc3 |
831 |
+ vmrgow $xc0,$xc0,$xc1 |
832 |
+ vmrgow $xc2,$xc2,$xc3 |
833 |
+ vmrgew $xt2,$xd0,$xd1 |
834 |
+ vmrgew $xt3,$xd2,$xd3 |
835 |
+ vmrgow $xd0,$xd0,$xd1 |
836 |
+ vmrgow $xd2,$xd2,$xd3 |
837 |
+ |
838 |
+ vpermdi $xc1,$xc0,$xc2,0b00 |
839 |
+ vpermdi $xc3,$xc0,$xc2,0b11 |
840 |
+ vpermdi $xc0,$xt0,$xt1,0b00 |
841 |
+ vpermdi $xc2,$xt0,$xt1,0b11 |
842 |
+ vpermdi $xd1,$xd0,$xd2,0b00 |
843 |
+ vpermdi $xd3,$xd0,$xd2,0b11 |
844 |
+ vpermdi $xd0,$xt2,$xt3,0b00 |
845 |
+ vpermdi $xd2,$xt2,$xt3,0b11 |
846 |
+ |
847 |
+ vspltisw $xt0,8 |
848 |
+ vadduwm $CTR0,$CTR0,$xt0 # next counter value |
849 |
+ vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5 |
850 |
+ |
851 |
+ vadduwm $xa0,$xa0,@K[0] |
852 |
+ vadduwm $xb0,$xb0,@K[1] |
853 |
+ vadduwm $xc0,$xc0,@K[2] |
854 |
+ vadduwm $xd0,$xd0,@K[3] |
855 |
+ |
856 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
857 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
858 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
859 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
860 |
+ |
861 |
+ ${UCMP}i $len,0x40 |
862 |
+ blt Ltail_vsx_8x |
863 |
+ |
864 |
+ lvx_4w $xt0,$x00,$inp |
865 |
+ lvx_4w $xt1,$x10,$inp |
866 |
+ lvx_4w $xt2,$x20,$inp |
867 |
+ lvx_4w $xt3,$x30,$inp |
868 |
+ |
869 |
+ vxor $xt0,$xt0,$xa0 |
870 |
+ vxor $xt1,$xt1,$xb0 |
871 |
+ vxor $xt2,$xt2,$xc0 |
872 |
+ vxor $xt3,$xt3,$xd0 |
873 |
+ |
874 |
+ stvx_4w $xt0,$x00,$out |
875 |
+ stvx_4w $xt1,$x10,$out |
876 |
+ addi $inp,$inp,0x40 |
877 |
+ stvx_4w $xt2,$x20,$out |
878 |
+ subi $len,$len,0x40 |
879 |
+ stvx_4w $xt3,$x30,$out |
880 |
+ addi $out,$out,0x40 |
881 |
+ beq Ldone_vsx_8x |
882 |
+ |
883 |
+ vadduwm $xa0,$xa1,@K[0] |
884 |
+ vadduwm $xb0,$xb1,@K[1] |
885 |
+ vadduwm $xc0,$xc1,@K[2] |
886 |
+ vadduwm $xd0,$xd1,@K[3] |
887 |
+ |
888 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
889 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
890 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
891 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
892 |
+ |
893 |
+ ${UCMP}i $len,0x40 |
894 |
+ blt Ltail_vsx_8x |
895 |
+ |
896 |
+ lvx_4w $xt0,$x00,$inp |
897 |
+ lvx_4w $xt1,$x10,$inp |
898 |
+ lvx_4w $xt2,$x20,$inp |
899 |
+ lvx_4w $xt3,$x30,$inp |
900 |
+ |
901 |
+ vxor $xt0,$xt0,$xa0 |
902 |
+ vxor $xt1,$xt1,$xb0 |
903 |
+ vxor $xt2,$xt2,$xc0 |
904 |
+ vxor $xt3,$xt3,$xd0 |
905 |
+ |
906 |
+ stvx_4w $xt0,$x00,$out |
907 |
+ stvx_4w $xt1,$x10,$out |
908 |
+ addi $inp,$inp,0x40 |
909 |
+ stvx_4w $xt2,$x20,$out |
910 |
+ subi $len,$len,0x40 |
911 |
+ stvx_4w $xt3,$x30,$out |
912 |
+ addi $out,$out,0x40 |
913 |
+ beq Ldone_vsx_8x |
914 |
+ |
915 |
+ vadduwm $xa0,$xa2,@K[0] |
916 |
+ vadduwm $xb0,$xb2,@K[1] |
917 |
+ vadduwm $xc0,$xc2,@K[2] |
918 |
+ vadduwm $xd0,$xd2,@K[3] |
919 |
+ |
920 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
921 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
922 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
923 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
924 |
+ |
925 |
+ ${UCMP}i $len,0x40 |
926 |
+ blt Ltail_vsx_8x |
927 |
+ |
928 |
+ lvx_4w $xt0,$x00,$inp |
929 |
+ lvx_4w $xt1,$x10,$inp |
930 |
+ lvx_4w $xt2,$x20,$inp |
931 |
+ lvx_4w $xt3,$x30,$inp |
932 |
+ |
933 |
+ vxor $xt0,$xt0,$xa0 |
934 |
+ vxor $xt1,$xt1,$xb0 |
935 |
+ vxor $xt2,$xt2,$xc0 |
936 |
+ vxor $xt3,$xt3,$xd0 |
937 |
+ |
938 |
+ stvx_4w $xt0,$x00,$out |
939 |
+ stvx_4w $xt1,$x10,$out |
940 |
+ addi $inp,$inp,0x40 |
941 |
+ stvx_4w $xt2,$x20,$out |
942 |
+ subi $len,$len,0x40 |
943 |
+ stvx_4w $xt3,$x30,$out |
944 |
+ addi $out,$out,0x40 |
945 |
+ beq Ldone_vsx_8x |
946 |
+ |
947 |
+ vadduwm $xa0,$xa3,@K[0] |
948 |
+ vadduwm $xb0,$xb3,@K[1] |
949 |
+ vadduwm $xc0,$xc3,@K[2] |
950 |
+ vadduwm $xd0,$xd3,@K[3] |
951 |
+ |
952 |
+ be?vperm $xa0,$xa0,$xa0,$beperm |
953 |
+ be?vperm $xb0,$xb0,$xb0,$beperm |
954 |
+ be?vperm $xc0,$xc0,$xc0,$beperm |
955 |
+ be?vperm $xd0,$xd0,$xd0,$beperm |
956 |
+ |
957 |
+ ${UCMP}i $len,0x40 |
958 |
+ blt Ltail_vsx_8x |
959 |
+ |
960 |
+ lvx_4w $xt0,$x00,$inp |
961 |
+ lvx_4w $xt1,$x10,$inp |
962 |
+ lvx_4w $xt2,$x20,$inp |
963 |
+ lvx_4w $xt3,$x30,$inp |
964 |
+ |
965 |
+ vxor $xt0,$xt0,$xa0 |
966 |
+ vxor $xt1,$xt1,$xb0 |
967 |
+ vxor $xt2,$xt2,$xc0 |
968 |
+ vxor $xt3,$xt3,$xd0 |
969 |
+ |
970 |
+ stvx_4w $xt0,$x00,$out |
971 |
+ stvx_4w $xt1,$x10,$out |
972 |
+ addi $inp,$inp,0x40 |
973 |
+ stvx_4w $xt2,$x20,$out |
974 |
+ subi $len,$len,0x40 |
975 |
+ stvx_4w $xt3,$x30,$out |
976 |
+ addi $out,$out,0x40 |
977 |
+ beq Ldone_vsx_8x |
978 |
+ |
979 |
+#blk4-7: 24:31 remain the same as we can use the same logic above . Reg a4-b7 remain same.Load c4,d7--> position 8-15.we can reuse vr24-31. |
980 |
+#VR0-3 : are used to load temp value, vr4 --> as xr0 instead of xt0. |
981 |
+ |
982 |
+ vxxlorc $CTR1 ,$xv5,$xv5 |
983 |
+ |
984 |
+ vxxlorc $xcn4 ,$xv18,$xv18 |
985 |
+ vxxlorc $xcn5 ,$xv19,$xv19 |
986 |
+ vxxlorc $xcn6 ,$xv20,$xv20 |
987 |
+ vxxlorc $xcn7 ,$xv21,$xv21 |
988 |
+ |
989 |
+ vxxlorc $xdn4 ,$xv13,$xv13 |
990 |
+ vxxlorc $xdn5 ,$xv14,$xv14 |
991 |
+ vxxlorc $xdn6 ,$xv15,$xv15 |
992 |
+ vxxlorc $xdn7 ,$xv16,$xv16 |
993 |
+ vadduwm $xdn4,$xdn4,$CTR1 |
994 |
+ |
995 |
+ vxxlorc $xb6 ,$xv6,$xv6 |
996 |
+ vxxlorc $xb7 ,$xv7,$xv7 |
997 |
+#use xa1->xr0, as xt0...in the block 4-7 |
998 |
+ |
999 |
+ vmrgew $xr0,$xa4,$xa5 # transpose data |
1000 |
+ vmrgew $xt1,$xa6,$xa7 |
1001 |
+ vmrgow $xa4,$xa4,$xa5 |
1002 |
+ vmrgow $xa6,$xa6,$xa7 |
1003 |
+ vmrgew $xt2,$xb4,$xb5 |
1004 |
+ vmrgew $xt3,$xb6,$xb7 |
1005 |
+ vmrgow $xb4,$xb4,$xb5 |
1006 |
+ vmrgow $xb6,$xb6,$xb7 |
1007 |
+ |
1008 |
+ vpermdi $xa5,$xa4,$xa6,0b00 |
1009 |
+ vpermdi $xa7,$xa4,$xa6,0b11 |
1010 |
+ vpermdi $xa4,$xr0,$xt1,0b00 |
1011 |
+ vpermdi $xa6,$xr0,$xt1,0b11 |
1012 |
+ vpermdi $xb5,$xb4,$xb6,0b00 |
1013 |
+ vpermdi $xb7,$xb4,$xb6,0b11 |
1014 |
+ vpermdi $xb4,$xt2,$xt3,0b00 |
1015 |
+ vpermdi $xb6,$xt2,$xt3,0b11 |
1016 |
+ |
1017 |
+ vmrgew $xr0,$xcn4,$xcn5 |
1018 |
+ vmrgew $xt1,$xcn6,$xcn7 |
1019 |
+ vmrgow $xcn4,$xcn4,$xcn5 |
1020 |
+ vmrgow $xcn6,$xcn6,$xcn7 |
1021 |
+ vmrgew $xt2,$xdn4,$xdn5 |
1022 |
+ vmrgew $xt3,$xdn6,$xdn7 |
1023 |
+ vmrgow $xdn4,$xdn4,$xdn5 |
1024 |
+ vmrgow $xdn6,$xdn6,$xdn7 |
1025 |
+ |
1026 |
+ vpermdi $xcn5,$xcn4,$xcn6,0b00 |
1027 |
+ vpermdi $xcn7,$xcn4,$xcn6,0b11 |
1028 |
+ vpermdi $xcn4,$xr0,$xt1,0b00 |
1029 |
+ vpermdi $xcn6,$xr0,$xt1,0b11 |
1030 |
+ vpermdi $xdn5,$xdn4,$xdn6,0b00 |
1031 |
+ vpermdi $xdn7,$xdn4,$xdn6,0b11 |
1032 |
+ vpermdi $xdn4,$xt2,$xt3,0b00 |
1033 |
+ vpermdi $xdn6,$xt2,$xt3,0b11 |
1034 |
+ |
1035 |
+ vspltisw $xr0,8 |
1036 |
+ vadduwm $CTR1,$CTR1,$xr0 # next counter value |
1037 |
+ vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5 |
1038 |
+ |
1039 |
+ vadduwm $xan0,$xa4,@K[0] |
1040 |
+ vadduwm $xbn0,$xb4,@K[1] |
1041 |
+ vadduwm $xcn0,$xcn4,@K[2] |
1042 |
+ vadduwm $xdn0,$xdn4,@K[3] |
1043 |
+ |
1044 |
+ be?vperm $xan0,$xa4,$xa4,$beperm |
1045 |
+ be?vperm $xbn0,$xb4,$xb4,$beperm |
1046 |
+ be?vperm $xcn0,$xcn4,$xcn4,$beperm |
1047 |
+ be?vperm $xdn0,$xdn4,$xdn4,$beperm |
1048 |
+ |
1049 |
+ ${UCMP}i $len,0x40 |
1050 |
+ blt Ltail_vsx_8x_1 |
1051 |
+ |
1052 |
+ lvx_4w $xr0,$x00,$inp |
1053 |
+ lvx_4w $xt1,$x10,$inp |
1054 |
+ lvx_4w $xt2,$x20,$inp |
1055 |
+ lvx_4w $xt3,$x30,$inp |
1056 |
+ |
1057 |
+ vxor $xr0,$xr0,$xan0 |
1058 |
+ vxor $xt1,$xt1,$xbn0 |
1059 |
+ vxor $xt2,$xt2,$xcn0 |
1060 |
+ vxor $xt3,$xt3,$xdn0 |
1061 |
+ |
1062 |
+ stvx_4w $xr0,$x00,$out |
1063 |
+ stvx_4w $xt1,$x10,$out |
1064 |
+ addi $inp,$inp,0x40 |
1065 |
+ stvx_4w $xt2,$x20,$out |
1066 |
+ subi $len,$len,0x40 |
1067 |
+ stvx_4w $xt3,$x30,$out |
1068 |
+ addi $out,$out,0x40 |
1069 |
+ beq Ldone_vsx_8x |
1070 |
+ |
1071 |
+ vadduwm $xan0,$xa5,@K[0] |
1072 |
+ vadduwm $xbn0,$xb5,@K[1] |
1073 |
+ vadduwm $xcn0,$xcn5,@K[2] |
1074 |
+ vadduwm $xdn0,$xdn5,@K[3] |
1075 |
+ |
1076 |
+ be?vperm $xan0,$xan0,$xan0,$beperm |
1077 |
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm |
1078 |
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm |
1079 |
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm |
1080 |
+ |
1081 |
+ ${UCMP}i $len,0x40 |
1082 |
+ blt Ltail_vsx_8x_1 |
1083 |
+ |
1084 |
+ lvx_4w $xr0,$x00,$inp |
1085 |
+ lvx_4w $xt1,$x10,$inp |
1086 |
+ lvx_4w $xt2,$x20,$inp |
1087 |
+ lvx_4w $xt3,$x30,$inp |
1088 |
+ |
1089 |
+ vxor $xr0,$xr0,$xan0 |
1090 |
+ vxor $xt1,$xt1,$xbn0 |
1091 |
+ vxor $xt2,$xt2,$xcn0 |
1092 |
+ vxor $xt3,$xt3,$xdn0 |
1093 |
+ |
1094 |
+ stvx_4w $xr0,$x00,$out |
1095 |
+ stvx_4w $xt1,$x10,$out |
1096 |
+ addi $inp,$inp,0x40 |
1097 |
+ stvx_4w $xt2,$x20,$out |
1098 |
+ subi $len,$len,0x40 |
1099 |
+ stvx_4w $xt3,$x30,$out |
1100 |
+ addi $out,$out,0x40 |
1101 |
+ beq Ldone_vsx_8x |
1102 |
+ |
1103 |
+ vadduwm $xan0,$xa6,@K[0] |
1104 |
+ vadduwm $xbn0,$xb6,@K[1] |
1105 |
+ vadduwm $xcn0,$xcn6,@K[2] |
1106 |
+ vadduwm $xdn0,$xdn6,@K[3] |
1107 |
+ |
1108 |
+ be?vperm $xan0,$xan0,$xan0,$beperm |
1109 |
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm |
1110 |
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm |
1111 |
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm |
1112 |
+ |
1113 |
+ ${UCMP}i $len,0x40 |
1114 |
+ blt Ltail_vsx_8x_1 |
1115 |
+ |
1116 |
+ lvx_4w $xr0,$x00,$inp |
1117 |
+ lvx_4w $xt1,$x10,$inp |
1118 |
+ lvx_4w $xt2,$x20,$inp |
1119 |
+ lvx_4w $xt3,$x30,$inp |
1120 |
+ |
1121 |
+ vxor $xr0,$xr0,$xan0 |
1122 |
+ vxor $xt1,$xt1,$xbn0 |
1123 |
+ vxor $xt2,$xt2,$xcn0 |
1124 |
+ vxor $xt3,$xt3,$xdn0 |
1125 |
+ |
1126 |
+ stvx_4w $xr0,$x00,$out |
1127 |
+ stvx_4w $xt1,$x10,$out |
1128 |
+ addi $inp,$inp,0x40 |
1129 |
+ stvx_4w $xt2,$x20,$out |
1130 |
+ subi $len,$len,0x40 |
1131 |
+ stvx_4w $xt3,$x30,$out |
1132 |
+ addi $out,$out,0x40 |
1133 |
+ beq Ldone_vsx_8x |
1134 |
+ |
1135 |
+ vadduwm $xan0,$xa7,@K[0] |
1136 |
+ vadduwm $xbn0,$xb7,@K[1] |
1137 |
+ vadduwm $xcn0,$xcn7,@K[2] |
1138 |
+ vadduwm $xdn0,$xdn7,@K[3] |
1139 |
+ |
1140 |
+ be?vperm $xan0,$xan0,$xan0,$beperm |
1141 |
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm |
1142 |
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm |
1143 |
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm |
1144 |
+ |
1145 |
+ ${UCMP}i $len,0x40 |
1146 |
+ blt Ltail_vsx_8x_1 |
1147 |
+ |
1148 |
+ lvx_4w $xr0,$x00,$inp |
1149 |
+ lvx_4w $xt1,$x10,$inp |
1150 |
+ lvx_4w $xt2,$x20,$inp |
1151 |
+ lvx_4w $xt3,$x30,$inp |
1152 |
+ |
1153 |
+ vxor $xr0,$xr0,$xan0 |
1154 |
+ vxor $xt1,$xt1,$xbn0 |
1155 |
+ vxor $xt2,$xt2,$xcn0 |
1156 |
+ vxor $xt3,$xt3,$xdn0 |
1157 |
+ |
1158 |
+ stvx_4w $xr0,$x00,$out |
1159 |
+ stvx_4w $xt1,$x10,$out |
1160 |
+ addi $inp,$inp,0x40 |
1161 |
+ stvx_4w $xt2,$x20,$out |
1162 |
+ subi $len,$len,0x40 |
1163 |
+ stvx_4w $xt3,$x30,$out |
1164 |
+ addi $out,$out,0x40 |
1165 |
+ beq Ldone_vsx_8x |
1166 |
+ |
1167 |
+ mtctr r0 |
1168 |
+ bne Loop_outer_vsx_8x |
1169 |
+ |
1170 |
+Ldone_vsx_8x: |
1171 |
+ lwz r12,`$FRAME-4`($sp) # pull vrsave |
1172 |
+ li r10,`15+$LOCALS+64` |
1173 |
+ li r11,`31+$LOCALS+64` |
1174 |
+ $POP r0, `$FRAME+$LRSAVE`($sp) |
1175 |
+ mtspr 256,r12 # restore vrsave |
1176 |
+ lvx v24,r10,$sp |
1177 |
+ addi r10,r10,32 |
1178 |
+ lvx v25,r11,$sp |
1179 |
+ addi r11,r11,32 |
1180 |
+ lvx v26,r10,$sp |
1181 |
+ addi r10,r10,32 |
1182 |
+ lvx v27,r11,$sp |
1183 |
+ addi r11,r11,32 |
1184 |
+ lvx v28,r10,$sp |
1185 |
+ addi r10,r10,32 |
1186 |
+ lvx v29,r11,$sp |
1187 |
+ addi r11,r11,32 |
1188 |
+ lvx v30,r10,$sp |
1189 |
+ lvx v31,r11,$sp |
1190 |
+ mtlr r0 |
1191 |
+ addi $sp,$sp,$FRAME |
1192 |
+ blr |
1193 |
+ |
1194 |
+.align 4 |
1195 |
+Ltail_vsx_8x: |
1196 |
+ addi r11,$sp,$LOCALS |
1197 |
+ mtctr $len |
1198 |
+ stvx_4w $xa0,$x00,r11 # offload block to stack |
1199 |
+ stvx_4w $xb0,$x10,r11 |
1200 |
+ stvx_4w $xc0,$x20,r11 |
1201 |
+ stvx_4w $xd0,$x30,r11 |
1202 |
+ subi r12,r11,1 # prepare for *++ptr |
1203 |
+ subi $inp,$inp,1 |
1204 |
+ subi $out,$out,1 |
1205 |
+ bl Loop_tail_vsx_8x |
1206 |
+Ltail_vsx_8x_1: |
1207 |
+ addi r11,$sp,$LOCALS |
1208 |
+ mtctr $len |
1209 |
+ stvx_4w $xan0,$x00,r11 # offload block to stack |
1210 |
+ stvx_4w $xbn0,$x10,r11 |
1211 |
+ stvx_4w $xcn0,$x20,r11 |
1212 |
+ stvx_4w $xdn0,$x30,r11 |
1213 |
+ subi r12,r11,1 # prepare for *++ptr |
1214 |
+ subi $inp,$inp,1 |
1215 |
+ subi $out,$out,1 |
1216 |
+ bl Loop_tail_vsx_8x |
1217 |
+ |
1218 |
+Loop_tail_vsx_8x: |
1219 |
+ lbzu r6,1(r12) |
1220 |
+ lbzu r7,1($inp) |
1221 |
+ xor r6,r6,r7 |
1222 |
+ stbu r6,1($out) |
1223 |
+ bdnz Loop_tail_vsx_8x |
1224 |
+ |
1225 |
+ stvx_4w $K[0],$x00,r11 # wipe copy of the block |
1226 |
+ stvx_4w $K[0],$x10,r11 |
1227 |
+ stvx_4w $K[0],$x20,r11 |
1228 |
+ stvx_4w $K[0],$x30,r11 |
1229 |
+ |
1230 |
+ b Ldone_vsx_8x |
1231 |
+ .long 0 |
1232 |
+ .byte 0,12,0x04,1,0x80,0,5,0 |
1233 |
+ .long 0 |
1234 |
+.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x |
1235 |
+___ |
1236 |
+}}} |
1237 |
+ |
1238 |
+ |
1239 |
+$code.=<<___; |
1240 |
+.align 5 |
1241 |
+Lconsts: |
1242 |
+ mflr r0 |
1243 |
+ bcl 20,31,\$+4 |
1244 |
+ mflr r12 #vvvvv "distance between . and Lsigma |
1245 |
+ addi r12,r12,`64-8` |
1246 |
+ mtlr r0 |
1247 |
+ blr |
1248 |
+ .long 0 |
1249 |
+ .byte 0,12,0x14,0,0,0,0,0 |
1250 |
+ .space `64-9*4` |
1251 |
+Lsigma: |
1252 |
+ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 |
1253 |
+ .long 1,0,0,0 |
1254 |
+ .long 2,0,0,0 |
1255 |
+ .long 3,0,0,0 |
1256 |
+ .long 4,0,0,0 |
1257 |
+___ |
1258 |
+$code.=<<___ if ($LITTLE_ENDIAN); |
1259 |
+ .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001 |
1260 |
+ .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300 |
1261 |
+___ |
1262 |
+$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words |
1263 |
+ .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d |
1264 |
+ .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c |
1265 |
+___ |
1266 |
+$code.=<<___; |
1267 |
+ .long 0x61707865,0x61707865,0x61707865,0x61707865 |
1268 |
+ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e |
1269 |
+ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 |
1270 |
+ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 |
1271 |
+ .long 0,1,2,3 |
1272 |
+ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c |
1273 |
+.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>" |
1274 |
+.align 2 |
1275 |
+___ |
1276 |
+ |
1277 |
+foreach (split("\n",$code)) { |
1278 |
+ s/\`([^\`]*)\`/eval $1/ge; |
1279 |
+ |
1280 |
+ # instructions prefixed with '?' are endian-specific and need |
1281 |
+ # to be adjusted accordingly... |
1282 |
+ if ($flavour !~ /le$/) { # big-endian |
1283 |
+ s/be\?// or |
1284 |
+ s/le\?/#le#/ or |
1285 |
+ s/\?lvsr/lvsl/ or |
1286 |
+ s/\?lvsl/lvsr/ or |
1287 |
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or |
1288 |
+ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; |
1289 |
+ } else { # little-endian |
1290 |
+ s/le\?// or |
1291 |
+ s/be\?/#be#/ or |
1292 |
+ s/\?([a-z]+)/$1/ or |
1293 |
+ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; |
1294 |
+ } |
1295 |
+ |
1296 |
+ print $_,"\n"; |
1297 |
+} |
1298 |
+ |
1299 |
+close STDOUT or die "error closing STDOUT: $!"; |
diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info
index c12cb9c..2a819b2 100644
--- a/crypto/chacha/build.info
+++ b/crypto/chacha/build.info
@@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}]
   $CHACHAASM_armv4=chacha-armv4.S
   $CHACHAASM_aarch64=chacha-armv8.S
 
-  $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s
+  $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
   $CHACHAASM_ppc64=$CHACHAASM_ppc32
 
   $CHACHAASM_c64xplus=chacha-c64xplus.s
@@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM
 GENERATE[chacha-x86.S]=asm/chacha-x86.pl
 GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl
 GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl
+GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
 GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
 INCLUDE[chacha-armv4.o]=..
 GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
diff --git a/crypto/chacha/chacha_ppc.c b/crypto/chacha/chacha_ppc.c
index 5319040..f99cca8 100644
--- a/crypto/chacha/chacha_ppc.c
+++ b/crypto/chacha/chacha_ppc.c
@@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
 void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
                         size_t len, const unsigned int key[8],
                         const unsigned int counter[4]);
+void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
+                            size_t len, const unsigned int key[8],
+                            const unsigned int counter[4]);
 void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                     size_t len, const unsigned int key[8],
                     const unsigned int counter[4])
 {
-    OPENSSL_ppccap_P & PPC_CRYPTO207
-        ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
-        : OPENSSL_ppccap_P & PPC_ALTIVEC
-            ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
-            : ChaCha20_ctr32_int(out, inp, len, key, counter);
+    OPENSSL_ppccap_P & PPC_BRD31
+        ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter)
+        :OPENSSL_ppccap_P & PPC_CRYPTO207
+         ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
+         : OPENSSL_ppccap_P & PPC_ALTIVEC
+           ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
+           : ChaCha20_ctr32_int(out, inp, len, key, counter);
 }
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index 2ee4440..4590340 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -293,6 +293,14 @@ my $vpermdi = sub { # xxpermdi
     $dm = oct($dm) if ($dm =~ /^0/);
     " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
 };
+my $vxxlor = sub { # xxlor
+    my ($f, $vrt, $vra, $vrb) = @_;
+    " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6;
+};
+my $vxxlorc = sub { # xxlor
+    my ($f, $vrt, $vra, $vrb) = @_;
+    " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1;
+};
 
 # PowerISA 2.07 stuff
 sub vcrypto_op {
@@ -377,6 +385,15 @@ my $addex = sub {
 };
 my $vmsumudm = sub { vfour_vsr(@_, 35); };
 
+# PowerISA 3.1 stuff
+my $brd = sub {
+    my ($f, $ra, $rs) = @_;
+    " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1);
+};
+my $vsrq = sub { vcrypto_op(@_, 517); };
+
+
+
 while($line=<>) {
 
     $line =~ s|[#!;].*$||;      # get rid of asm-style comments...
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
index 8bcfed2..664627c 100644
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void);
 void OPENSSL_altivec_probe(void);
 void OPENSSL_crypto207_probe(void);
 void OPENSSL_madd300_probe(void);
+void OPENSSL_brd31_probe(void);
 
 long OPENSSL_rdtsc_mftb(void);
 long OPENSSL_rdtsc_mfspr268(void);
@@ -117,16 +118,21 @@ static unsigned long getauxval(unsigned long key)
 #endif
 
 /* I wish <sys/auxv.h> was universally available */
-#define HWCAP 16 /* AT_HWCAP */
+#ifndef AT_HWCAP
+# define AT_HWCAP 16 /* AT_HWCAP */
+#endif
 #define HWCAP_PPC64 (1U << 30)
 #define HWCAP_ALTIVEC (1U << 28)
 #define HWCAP_FPU (1U << 27)
 #define HWCAP_POWER6_EXT (1U << 9)
 #define HWCAP_VSX (1U << 7)
 
-#define HWCAP2 26 /* AT_HWCAP2 */
+#ifndef AT_HWCAP2
+# define AT_HWCAP2 26 /* AT_HWCAP2 */
+#endif
 #define HWCAP_VEC_CRYPTO (1U << 25)
 #define HWCAP_ARCH_3_00 (1U << 23)
+#define HWCAP_ARCH_3_1 (1U << 18)
 
 # if defined(__GNUC__) && __GNUC__>=2
 __attribute__ ((constructor))
@@ -187,6 +193,9 @@ void OPENSSL_cpuid_setup(void)
     if (__power_set(0xffffffffU<<17)) /* POWER9 and later */
         OPENSSL_ppccap_P |= PPC_MADD300;
 
+    if (__power_set(0xffffffffU<<18)) /* POWER10 and later */
+        OPENSSL_ppccap_P |= PPC_BRD31;
+
     return;
 # endif
 #endif
@@ -215,8 +224,8 @@ void OPENSSL_cpuid_setup(void)
 
 #ifdef OSSL_IMPLEMENT_GETAUXVAL
     {
-        unsigned long hwcap = getauxval(HWCAP);
-        unsigned long hwcap2 = getauxval(HWCAP2);
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        unsigned long hwcap2 = getauxval(AT_HWCAP2);
 
         if (hwcap & HWCAP_FPU) {
             OPENSSL_ppccap_P |= PPC_FPU;
@@ -242,6 +251,10 @@ void OPENSSL_cpuid_setup(void)
         if (hwcap2 & HWCAP_ARCH_3_00) {
             OPENSSL_ppccap_P |= PPC_MADD300;
         }
+
+        if (hwcap2 & HWCAP_ARCH_3_1) {
+            OPENSSL_ppccap_P |= PPC_BRD31;
+        }
     }
 #endif
 
@@ -263,7 +276,7 @@ void OPENSSL_cpuid_setup(void)
     sigaction(SIGILL, &ill_act, &ill_oact);
 
 #ifndef OSSL_IMPLEMENT_GETAUXVAL
-    if (sigsetjmp(ill_jmp,1) == 0) {
+    if (sigsetjmp(ill_jmp, 1) == 0) {
         OPENSSL_fpu_probe();
         OPENSSL_ppccap_P |= PPC_FPU;
 
diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index c6555df..706164a 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -81,6 +81,17 @@ $code=<<___;
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
 
+.globl	.OPENSSL_brd31_probe
+.align	4
+.OPENSSL_brd31_probe:
+	xor	r0,r0,r0
+	brd	r3,r0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+.size	.OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
+
+
 .globl	.OPENSSL_wipe_cpu
 .align	4
 .OPENSSL_wipe_cpu:
diff --git a/include/crypto/ppc_arch.h b/include/crypto/ppc_arch.h
index 3b3ce4b..fcc846c 100644
--- a/include/crypto/ppc_arch.h
+++ b/include/crypto/ppc_arch.h
@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P;
 # define PPC_MADD300 (1<<4)
 # define PPC_MFTB (1<<5)
 # define PPC_MFSPR268 (1<<6)
+# define PPC_BRD31 (1<<7)
 
 #endif