
Contents of /rpms/openssl3/contribs10/0071-AES-GCM-performance-optimization.patch



Revision 1.1
Wed Jan 31 17:24:43 2024 UTC by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 Upstream-Status: Backport [https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c, https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd]
2 diff --git a/crypto/modes/asm/aes-gcm-ppc.pl b/crypto/modes/asm/aes-gcm-ppc.pl
3 new file mode 100644
4 index 0000000..6624e6c
5 --- /dev/null
6 +++ b/crypto/modes/asm/aes-gcm-ppc.pl
7 @@ -0,0 +1,1438 @@
8 +#! /usr/bin/env perl
9 +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
10 +# Copyright 2021- IBM Inc. All rights reserved
11 +#
12 +# Licensed under the Apache License 2.0 (the "License"). You may not use
13 +# this file except in compliance with the License. You can obtain a copy
14 +# in the file LICENSE in the source distribution or at
15 +# https://www.openssl.org/source/license.html
16 +#
17 +#===================================================================================
18 +# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
19 +#
20 +# GHASH is based on the Karatsuba multiplication method.
21 +#
22 +# X1 = Xi xor X1
23 +#
24 +# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
25 +# (X1.h * H^4.h + X1.l * H^4.l + X1 * H^4) +
26 +# (X2.h * H^3.h + X2.l * H^3.l + X2 * H^3) +
27 +# (X3.h * H^2.h + X3.l * H^2.l + X3 * H^2) +
28 +# (X4.h * H.h + X4.l * H.l + X4 * H)
29 +#
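+# In C terms the fold above is roughly (sketch only; gfmul128 stands in
+# for a GF(2^128) carry-less multiply with reduction and is not a routine
+# in this file):
+#
+#   X1 ^= Xi;
+#   Xi = gfmul128(X1, H4) ^ gfmul128(X2, H3) ^
+#        gfmul128(X3, H2) ^ gfmul128(X4, H);
+#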
30 +# Xi = v0
31 +# H Poly = v2
32 +# Hash keys = v3 - v14
33 +# ( H.l, H, H.h)
34 +# ( H^2.l, H^2, H^2.h)
35 +# ( H^3.l, H^3, H^3.h)
36 +# ( H^4.l, H^4, H^4.h)
37 +#
38 +# v30 is IV
39 +# v31 - counter 1
40 +#
41 +# AES used,
42 +# vs0 - vs14 for round keys
43 +# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
44 +#
45 +# This implementation uses a stitched AES-GCM approach to improve overall performance.
46 +# AES is implemented with 8x blocks and GHASH uses two 4x blocks.
47 +#
48 +# Current large-block (16384 bytes) throughput per second with a 128-bit key --
49 +#
50 +# Encrypt Decrypt
51 +# Power10[le] (3.5GHz) 5.32G 5.26G
52 +#
53 +# ===================================================================================
54 +#
55 +# $output is the last argument if it looks like a file (it has an extension)
56 +# $flavour is the first argument if it doesn't look like a file
57 +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
58 +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
59 +
60 +if ($flavour =~ /64/) {
61 + $SIZE_T=8;
62 + $LRSAVE=2*$SIZE_T;
63 + $STU="stdu";
64 + $POP="ld";
65 + $PUSH="std";
66 + $UCMP="cmpld";
67 + $SHRI="srdi";
68 +} elsif ($flavour =~ /32/) {
69 + $SIZE_T=4;
70 + $LRSAVE=$SIZE_T;
71 + $STU="stwu";
72 + $POP="lwz";
73 + $PUSH="stw";
74 + $UCMP="cmplw";
75 + $SHRI="srwi";
76 +} else { die "nonsense $flavour"; }
77 +
78 +$sp="r1";
79 +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
80 +
81 +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
82 +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83 +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
84 +die "can't locate ppc-xlate.pl";
85 +
86 +open STDOUT,"| $^X $xlate $flavour \"$output\""
87 + or die "can't call $xlate: $!";
88 +
89 +$code=<<___;
90 +.machine "any"
91 +.text
92 +
93 +# 4x loops
94 +# v15 - v18 - input states
95 +# vs1 - vs9 - round keys
96 +#
97 +.macro Loop_aes_middle4x
98 + xxlor 19+32, 1, 1
99 + xxlor 20+32, 2, 2
100 + xxlor 21+32, 3, 3
101 + xxlor 22+32, 4, 4
102 +
103 + vcipher 15, 15, 19
104 + vcipher 16, 16, 19
105 + vcipher 17, 17, 19
106 + vcipher 18, 18, 19
107 +
108 + vcipher 15, 15, 20
109 + vcipher 16, 16, 20
110 + vcipher 17, 17, 20
111 + vcipher 18, 18, 20
112 +
113 + vcipher 15, 15, 21
114 + vcipher 16, 16, 21
115 + vcipher 17, 17, 21
116 + vcipher 18, 18, 21
117 +
118 + vcipher 15, 15, 22
119 + vcipher 16, 16, 22
120 + vcipher 17, 17, 22
121 + vcipher 18, 18, 22
122 +
123 + xxlor 19+32, 5, 5
124 + xxlor 20+32, 6, 6
125 + xxlor 21+32, 7, 7
126 + xxlor 22+32, 8, 8
127 +
128 + vcipher 15, 15, 19
129 + vcipher 16, 16, 19
130 + vcipher 17, 17, 19
131 + vcipher 18, 18, 19
132 +
133 + vcipher 15, 15, 20
134 + vcipher 16, 16, 20
135 + vcipher 17, 17, 20
136 + vcipher 18, 18, 20
137 +
138 + vcipher 15, 15, 21
139 + vcipher 16, 16, 21
140 + vcipher 17, 17, 21
141 + vcipher 18, 18, 21
142 +
143 + vcipher 15, 15, 22
144 + vcipher 16, 16, 22
145 + vcipher 17, 17, 22
146 + vcipher 18, 18, 22
147 +
148 + xxlor 23+32, 9, 9
149 + vcipher 15, 15, 23
150 + vcipher 16, 16, 23
151 + vcipher 17, 17, 23
152 + vcipher 18, 18, 23
153 +.endm
154 +
155 +# 8x loops
156 +# v15 - v22 - input states
157 +# vs1 - vs9 - round keys
158 +#
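+# Round-by-round, this macro is the C-style sketch below (aes_round stands
+# in for one vcipher step; it is not a routine in this file):
+#
+#   for (r = 1; r <= 9; r++)          /* round keys vs1 - vs9 */
+#       for (b = 0; b < 8; b++)       /* states v15 - v22 */
+#           state[b] = aes_round(state[b], rk[r]);
+#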
159 +.macro Loop_aes_middle8x
160 + xxlor 23+32, 1, 1
161 + xxlor 24+32, 2, 2
162 + xxlor 25+32, 3, 3
163 + xxlor 26+32, 4, 4
164 +
165 + vcipher 15, 15, 23
166 + vcipher 16, 16, 23
167 + vcipher 17, 17, 23
168 + vcipher 18, 18, 23
169 + vcipher 19, 19, 23
170 + vcipher 20, 20, 23
171 + vcipher 21, 21, 23
172 + vcipher 22, 22, 23
173 +
174 + vcipher 15, 15, 24
175 + vcipher 16, 16, 24
176 + vcipher 17, 17, 24
177 + vcipher 18, 18, 24
178 + vcipher 19, 19, 24
179 + vcipher 20, 20, 24
180 + vcipher 21, 21, 24
181 + vcipher 22, 22, 24
182 +
183 + vcipher 15, 15, 25
184 + vcipher 16, 16, 25
185 + vcipher 17, 17, 25
186 + vcipher 18, 18, 25
187 + vcipher 19, 19, 25
188 + vcipher 20, 20, 25
189 + vcipher 21, 21, 25
190 + vcipher 22, 22, 25
191 +
192 + vcipher 15, 15, 26
193 + vcipher 16, 16, 26
194 + vcipher 17, 17, 26
195 + vcipher 18, 18, 26
196 + vcipher 19, 19, 26
197 + vcipher 20, 20, 26
198 + vcipher 21, 21, 26
199 + vcipher 22, 22, 26
200 +
201 + xxlor 23+32, 5, 5
202 + xxlor 24+32, 6, 6
203 + xxlor 25+32, 7, 7
204 + xxlor 26+32, 8, 8
205 +
206 + vcipher 15, 15, 23
207 + vcipher 16, 16, 23
208 + vcipher 17, 17, 23
209 + vcipher 18, 18, 23
210 + vcipher 19, 19, 23
211 + vcipher 20, 20, 23
212 + vcipher 21, 21, 23
213 + vcipher 22, 22, 23
214 +
215 + vcipher 15, 15, 24
216 + vcipher 16, 16, 24
217 + vcipher 17, 17, 24
218 + vcipher 18, 18, 24
219 + vcipher 19, 19, 24
220 + vcipher 20, 20, 24
221 + vcipher 21, 21, 24
222 + vcipher 22, 22, 24
223 +
224 + vcipher 15, 15, 25
225 + vcipher 16, 16, 25
226 + vcipher 17, 17, 25
227 + vcipher 18, 18, 25
228 + vcipher 19, 19, 25
229 + vcipher 20, 20, 25
230 + vcipher 21, 21, 25
231 + vcipher 22, 22, 25
232 +
233 + vcipher 15, 15, 26
234 + vcipher 16, 16, 26
235 + vcipher 17, 17, 26
236 + vcipher 18, 18, 26
237 + vcipher 19, 19, 26
238 + vcipher 20, 20, 26
239 + vcipher 21, 21, 26
240 + vcipher 22, 22, 26
241 +
242 + xxlor 23+32, 9, 9
243 + vcipher 15, 15, 23
244 + vcipher 16, 16, 23
245 + vcipher 17, 17, 23
246 + vcipher 18, 18, 23
247 + vcipher 19, 19, 23
248 + vcipher 20, 20, 23
249 + vcipher 21, 21, 23
250 + vcipher 22, 22, 23
251 +.endm
252 +
253 +#
254 +# Compute 4x hash values based on Karatsuba method.
255 +#
256 +ppc_aes_gcm_ghash:
257 + vxor 15, 15, 0
258 +
259 + xxlxor 29, 29, 29
260 +
261 + vpmsumd 23, 12, 15 # H4.L * X.L
262 + vpmsumd 24, 9, 16
263 + vpmsumd 25, 6, 17
264 + vpmsumd 26, 3, 18
265 +
266 + vxor 23, 23, 24
267 + vxor 23, 23, 25
268 + vxor 23, 23, 26 # L
269 +
270 + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
271 + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
272 + vpmsumd 26, 7, 17
273 + vpmsumd 27, 4, 18
274 +
275 + vxor 24, 24, 25
276 + vxor 24, 24, 26
277 + vxor 24, 24, 27 # M
278 +
279 + # sum hash and reduction with H Poly
280 + vpmsumd 28, 23, 2 # reduction
281 +
282 + xxlor 29+32, 29, 29
283 + vsldoi 26, 24, 29, 8 # mL
284 + vsldoi 29, 29, 24, 8 # mH
285 + vxor 23, 23, 26 # mL + L
286 +
287 + vsldoi 23, 23, 23, 8 # swap
288 + vxor 23, 23, 28
289 +
290 + vpmsumd 24, 14, 15 # H4.H * X.H
291 + vpmsumd 25, 11, 16
292 + vpmsumd 26, 8, 17
293 + vpmsumd 27, 5, 18
294 +
295 + vxor 24, 24, 25
296 + vxor 24, 24, 26
297 + vxor 24, 24, 27
298 +
299 + vxor 24, 24, 29
300 +
301 + # sum hash and reduction with H Poly
302 + vsldoi 27, 23, 23, 8 # swap
303 + vpmsumd 23, 23, 2
304 + vxor 27, 27, 24
305 + vxor 23, 23, 27
306 +
307 + xxlor 32, 23+32, 23+32 # update hash
308 +
309 + blr
310 +
311 +#
312 +# Combine two 4x ghash
313 +# v15 - v22 - input blocks
314 +#
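+# Net effect, sketched in C (ghash4 stands in for the 4x Karatsuba fold
+# described in the file header; it is not a routine here):
+#
+#   T  = ghash4(Xi ^ B1, B2, B3, B4);   /* first four blocks, v15-v18  */
+#   Xi = ghash4(T ^ B5, B6, B7, B8);    /* second four blocks, v19-v22 */
+#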
315 +.macro ppc_aes_gcm_ghash2_4x
316 + # first 4x hash
317 + vxor 15, 15, 0 # Xi + X
318 +
319 + xxlxor 29, 29, 29
320 +
321 + vpmsumd 23, 12, 15 # H4.L * X.L
322 + vpmsumd 24, 9, 16
323 + vpmsumd 25, 6, 17
324 + vpmsumd 26, 3, 18
325 +
326 + vxor 23, 23, 24
327 + vxor 23, 23, 25
328 + vxor 23, 23, 26 # L
329 +
330 + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
331 + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
332 + vpmsumd 26, 7, 17
333 + vpmsumd 27, 4, 18
334 +
335 + vxor 24, 24, 25
336 + vxor 24, 24, 26
337 +
338 + # sum hash and reduction with H Poly
339 + vpmsumd 28, 23, 2 # reduction
340 +
341 + xxlor 29+32, 29, 29
342 +
343 + vxor 24, 24, 27 # M
344 + vsldoi 26, 24, 29, 8 # mL
345 + vsldoi 29, 29, 24, 8 # mH
346 + vxor 23, 23, 26 # mL + L
347 +
348 + vsldoi 23, 23, 23, 8 # swap
349 + vxor 23, 23, 28
350 +
351 + vpmsumd 24, 14, 15 # H4.H * X.H
352 + vpmsumd 25, 11, 16
353 + vpmsumd 26, 8, 17
354 + vpmsumd 27, 5, 18
355 +
356 + vxor 24, 24, 25
357 + vxor 24, 24, 26
358 + vxor 24, 24, 27 # H
359 +
360 + vxor 24, 24, 29 # H + mH
361 +
362 + # sum hash and reduction with H Poly
363 + vsldoi 27, 23, 23, 8 # swap
364 + vpmsumd 23, 23, 2
365 + vxor 27, 27, 24
366 + vxor 27, 23, 27 # 1st Xi
367 +
368 + # 2nd 4x hash
369 + vpmsumd 24, 9, 20
370 + vpmsumd 25, 6, 21
371 + vpmsumd 26, 3, 22
372 + vxor 19, 19, 27 # Xi + X
373 + vpmsumd 23, 12, 19 # H4.L * X.L
374 +
375 + vxor 23, 23, 24
376 + vxor 23, 23, 25
377 + vxor 23, 23, 26 # L
378 +
379 + vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
380 + vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
381 + vpmsumd 26, 7, 21
382 + vpmsumd 27, 4, 22
383 +
384 + vxor 24, 24, 25
385 + vxor 24, 24, 26
386 +
387 + # sum hash and reduction with H Poly
388 + vpmsumd 28, 23, 2 # reduction
389 +
390 + xxlor 29+32, 29, 29
391 +
392 + vxor 24, 24, 27 # M
393 + vsldoi 26, 24, 29, 8 # mL
394 + vsldoi 29, 29, 24, 8 # mH
395 + vxor 23, 23, 26 # mL + L
396 +
397 + vsldoi 23, 23, 23, 8 # swap
398 + vxor 23, 23, 28
399 +
400 + vpmsumd 24, 14, 19 # H4.H * X.H
401 + vpmsumd 25, 11, 20
402 + vpmsumd 26, 8, 21
403 + vpmsumd 27, 5, 22
404 +
405 + vxor 24, 24, 25
406 + vxor 24, 24, 26
407 + vxor 24, 24, 27 # H
408 +
409 + vxor 24, 24, 29 # H + mH
410 +
411 + # sum hash and reduction with H Poly
412 + vsldoi 27, 23, 23, 8 # swap
413 + vpmsumd 23, 23, 2
414 + vxor 27, 27, 24
415 + vxor 23, 23, 27
416 +
417 + xxlor 32, 23+32, 23+32 # update hash
418 +
419 +.endm
420 +
421 +#
422 +# Compute and update a single hash
423 +#
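+# Equivalent single-block step, as a sketch (gfmul128 as in the header
+# comment; C is the ciphertext block passed in v28):
+#
+#   Xi = gfmul128(Xi ^ C, H);
+#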
424 +.macro ppc_update_hash_1x
425 + vxor 28, 28, 0
426 +
427 + vxor 19, 19, 19
428 +
429 + vpmsumd 22, 3, 28 # L
430 + vpmsumd 23, 4, 28 # M
431 + vpmsumd 24, 5, 28 # H
432 +
433 + vpmsumd 27, 22, 2 # reduction
434 +
435 + vsldoi 25, 23, 19, 8 # mL
436 + vsldoi 26, 19, 23, 8 # mH
437 + vxor 22, 22, 25 # L + mL
438 + vxor 24, 24, 26 # H + mH
439 +
440 + vsldoi 22, 22, 22, 8 # swap
441 + vxor 22, 22, 27
442 +
443 + vsldoi 20, 22, 22, 8 # swap
444 + vpmsumd 22, 22, 2 # reduction
445 + vxor 20, 20, 24
446 + vxor 22, 22, 20
447 +
448 + vmr 0, 22 # update hash
449 +
450 +.endm
451 +
452 +#
453 +# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
454 +# const AES_KEY *key, unsigned char iv[16],
455 +# void *Xip);
456 +#
457 +# r3 - inp
458 +# r4 - out
459 +# r5 - len
460 +# r6 - AES round keys
461 +# r7 - iv
462 +# r8 - Xi, H Poly, hash keys
463 +#
464 +.global ppc_aes_gcm_encrypt
465 +.align 5
466 +ppc_aes_gcm_encrypt:
467 +_ppc_aes_gcm_encrypt:
468 +
469 + stdu 1,-512(1)
470 + mflr 0
471 +
472 + std 14,112(1)
473 + std 15,120(1)
474 + std 16,128(1)
475 + std 17,136(1)
476 + std 18,144(1)
477 + std 19,152(1)
478 + std 20,160(1)
479 + std 21,168(1)
480 + li 9, 256
481 + stvx 20, 9, 1
482 + addi 9, 9, 16
483 + stvx 21, 9, 1
484 + addi 9, 9, 16
485 + stvx 22, 9, 1
486 + addi 9, 9, 16
487 + stvx 23, 9, 1
488 + addi 9, 9, 16
489 + stvx 24, 9, 1
490 + addi 9, 9, 16
491 + stvx 25, 9, 1
492 + addi 9, 9, 16
493 + stvx 26, 9, 1
494 + addi 9, 9, 16
495 + stvx 27, 9, 1
496 + addi 9, 9, 16
497 + stvx 28, 9, 1
498 + addi 9, 9, 16
499 + stvx 29, 9, 1
500 + addi 9, 9, 16
501 + stvx 30, 9, 1
502 + addi 9, 9, 16
503 + stvx 31, 9, 1
504 + std 0, 528(1)
505 +
506 + # Load Xi
507 + lxvb16x 32, 0, 8 # load Xi
508 +
509 + # load Hash - h, h^2, h^3, h^4
510 + li 10, 32
511 + lxvd2x 2+32, 10, 8 # H Poly
512 + li 10, 48
513 + lxvd2x 3+32, 10, 8 # Hl
514 + li 10, 64
515 + lxvd2x 4+32, 10, 8 # H
516 + li 10, 80
517 + lxvd2x 5+32, 10, 8 # Hh
518 +
519 + li 10, 96
520 + lxvd2x 6+32, 10, 8 # H^2l
521 + li 10, 112
522 + lxvd2x 7+32, 10, 8 # H^2
523 + li 10, 128
524 + lxvd2x 8+32, 10, 8 # H^2h
525 +
526 + li 10, 144
527 + lxvd2x 9+32, 10, 8 # H^3l
528 + li 10, 160
529 + lxvd2x 10+32, 10, 8 # H^3
530 + li 10, 176
531 + lxvd2x 11+32, 10, 8 # H^3h
532 +
533 + li 10, 192
534 + lxvd2x 12+32, 10, 8 # H^4l
535 + li 10, 208
536 + lxvd2x 13+32, 10, 8 # H^4
537 + li 10, 224
538 + lxvd2x 14+32, 10, 8 # H^4h
539 +
540 + # initialize ICB: GHASH( IV ), IV - r7
541 + lxvb16x 30+32, 0, 7 # load IV - v30
542 +
543 + mr 12, 5 # length
544 + li 11, 0 # block index
545 +
546 + # counter 1
547 + vxor 31, 31, 31
548 + vspltisb 22, 1
549 + vsldoi 31, 31, 22, 1 # counter 1
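+ # v31 now holds the doublewords { 0, 1 }: vspltisb puts 0x01 in every
+ # byte of v22 and the one-byte vsldoi shift leaves a single 0x01 in the
+ # last byte, so vaddudm below bumps the counter word of the IV block.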
550 +
551 + # load round key to VSR
552 + lxv 0, 0(6)
553 + lxv 1, 0x10(6)
554 + lxv 2, 0x20(6)
555 + lxv 3, 0x30(6)
556 + lxv 4, 0x40(6)
557 + lxv 5, 0x50(6)
558 + lxv 6, 0x60(6)
559 + lxv 7, 0x70(6)
560 + lxv 8, 0x80(6)
561 + lxv 9, 0x90(6)
562 + lxv 10, 0xa0(6)
563 +
564 + # load rounds - 10 (128), 12 (192), 14 (256)
565 + lwz 9,240(6)
566 +
567 + #
568 + # vxor state, state, w # addroundkey
569 + xxlor 32+29, 0, 0
570 + vxor 15, 30, 29 # IV + round key - add round key 0
571 +
572 + cmpdi 9, 10
573 + beq Loop_aes_gcm_8x
574 +
575 + # load 2 more round keys (v11, v12)
576 + lxv 11, 0xb0(6)
577 + lxv 12, 0xc0(6)
578 +
579 + cmpdi 9, 12
580 + beq Loop_aes_gcm_8x
581 +
582 + # load 2 more round keys (v13, v14)
583 + lxv 13, 0xd0(6)
584 + lxv 14, 0xe0(6)
585 + cmpdi 9, 14
586 + beq Loop_aes_gcm_8x
587 +
588 + b aes_gcm_out
589 +
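+# The branch chain above is, in effect (rounds comes from 240(r6), the
+# rounds field of the AES_KEY; sketch only):
+#
+#   if (rounds > 10) load vs11, vs12;   /* AES-192 and AES-256 */
+#   if (rounds > 12) load vs13, vs14;   /* AES-256 only */
+#   if (rounds != 10 && rounds != 12 && rounds != 14) return;
+#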
590 +.align 5
591 +Loop_aes_gcm_8x:
592 + mr 14, 3
593 + mr 9, 4
594 +
595 + # n blocks
596 + li 10, 128
597 + divdu 10, 5, 10 # n 128-byte blocks
598 + cmpdi 10, 0
599 + beq Loop_last_block
600 +
601 + vaddudm 30, 30, 31 # IV + counter
602 + vxor 16, 30, 29
603 + vaddudm 30, 30, 31
604 + vxor 17, 30, 29
605 + vaddudm 30, 30, 31
606 + vxor 18, 30, 29
607 + vaddudm 30, 30, 31
608 + vxor 19, 30, 29
609 + vaddudm 30, 30, 31
610 + vxor 20, 30, 29
611 + vaddudm 30, 30, 31
612 + vxor 21, 30, 29
613 + vaddudm 30, 30, 31
614 + vxor 22, 30, 29
615 +
616 + mtctr 10
617 +
618 + li 15, 16
619 + li 16, 32
620 + li 17, 48
621 + li 18, 64
622 + li 19, 80
623 + li 20, 96
624 + li 21, 112
625 +
626 + lwz 10, 240(6)
627 +
628 +Loop_8x_block:
629 +
630 + lxvb16x 15, 0, 14 # load block
631 + lxvb16x 16, 15, 14 # load block
632 + lxvb16x 17, 16, 14 # load block
633 + lxvb16x 18, 17, 14 # load block
634 + lxvb16x 19, 18, 14 # load block
635 + lxvb16x 20, 19, 14 # load block
636 + lxvb16x 21, 20, 14 # load block
637 + lxvb16x 22, 21, 14 # load block
638 + addi 14, 14, 128
639 +
640 + Loop_aes_middle8x
641 +
642 + xxlor 23+32, 10, 10
643 +
644 + cmpdi 10, 10
645 + beq Do_next_ghash
646 +
647 + # 192 bits
648 + xxlor 24+32, 11, 11
649 +
650 + vcipher 15, 15, 23
651 + vcipher 16, 16, 23
652 + vcipher 17, 17, 23
653 + vcipher 18, 18, 23
654 + vcipher 19, 19, 23
655 + vcipher 20, 20, 23
656 + vcipher 21, 21, 23
657 + vcipher 22, 22, 23
658 +
659 + vcipher 15, 15, 24
660 + vcipher 16, 16, 24
661 + vcipher 17, 17, 24
662 + vcipher 18, 18, 24
663 + vcipher 19, 19, 24
664 + vcipher 20, 20, 24
665 + vcipher 21, 21, 24
666 + vcipher 22, 22, 24
667 +
668 + xxlor 23+32, 12, 12
669 +
670 + cmpdi 10, 12
671 + beq Do_next_ghash
672 +
673 + # 256 bits
674 + xxlor 24+32, 13, 13
675 +
676 + vcipher 15, 15, 23
677 + vcipher 16, 16, 23
678 + vcipher 17, 17, 23
679 + vcipher 18, 18, 23
680 + vcipher 19, 19, 23
681 + vcipher 20, 20, 23
682 + vcipher 21, 21, 23
683 + vcipher 22, 22, 23
684 +
685 + vcipher 15, 15, 24
686 + vcipher 16, 16, 24
687 + vcipher 17, 17, 24
688 + vcipher 18, 18, 24
689 + vcipher 19, 19, 24
690 + vcipher 20, 20, 24
691 + vcipher 21, 21, 24
692 + vcipher 22, 22, 24
693 +
694 + xxlor 23+32, 14, 14
695 +
696 + cmpdi 10, 14
697 + beq Do_next_ghash
698 + b aes_gcm_out
699 +
700 +Do_next_ghash:
701 +
702 + #
703 + # last round
704 + vcipherlast 15, 15, 23
705 + vcipherlast 16, 16, 23
706 +
707 + xxlxor 47, 47, 15
708 + stxvb16x 47, 0, 9 # store output
709 + xxlxor 48, 48, 16
710 + stxvb16x 48, 15, 9 # store output
711 +
712 + vcipherlast 17, 17, 23
713 + vcipherlast 18, 18, 23
714 +
715 + xxlxor 49, 49, 17
716 + stxvb16x 49, 16, 9 # store output
717 + xxlxor 50, 50, 18
718 + stxvb16x 50, 17, 9 # store output
719 +
720 + vcipherlast 19, 19, 23
721 + vcipherlast 20, 20, 23
722 +
723 + xxlxor 51, 51, 19
724 + stxvb16x 51, 18, 9 # store output
725 + xxlxor 52, 52, 20
726 + stxvb16x 52, 19, 9 # store output
727 +
728 + vcipherlast 21, 21, 23
729 + vcipherlast 22, 22, 23
730 +
731 + xxlxor 53, 53, 21
732 + stxvb16x 53, 20, 9 # store output
733 + xxlxor 54, 54, 22
734 + stxvb16x 54, 21, 9 # store output
735 +
736 + addi 9, 9, 128
737 +
738 + # ghash here
739 + ppc_aes_gcm_ghash2_4x
740 +
741 + xxlor 27+32, 0, 0
742 + vaddudm 30, 30, 31 # IV + counter
743 + vmr 29, 30
744 + vxor 15, 30, 27 # add round key
745 + vaddudm 30, 30, 31
746 + vxor 16, 30, 27
747 + vaddudm 30, 30, 31
748 + vxor 17, 30, 27
749 + vaddudm 30, 30, 31
750 + vxor 18, 30, 27
751 + vaddudm 30, 30, 31
752 + vxor 19, 30, 27
753 + vaddudm 30, 30, 31
754 + vxor 20, 30, 27
755 + vaddudm 30, 30, 31
756 + vxor 21, 30, 27
757 + vaddudm 30, 30, 31
758 + vxor 22, 30, 27
759 +
760 + addi 12, 12, -128
761 + addi 11, 11, 128
762 +
763 + bdnz Loop_8x_block
764 +
765 + vmr 30, 29
766 +
767 +Loop_last_block:
768 + cmpdi 12, 0
769 + beq aes_gcm_out
770 +
771 + # loop last few blocks
772 + li 10, 16
773 + divdu 10, 12, 10
774 +
775 + mtctr 10
776 +
777 + lwz 10, 240(6)
778 +
779 + cmpdi 12, 16
780 + blt Final_block
781 +
782 +.macro Loop_aes_middle_1x
783 + xxlor 19+32, 1, 1
784 + xxlor 20+32, 2, 2
785 + xxlor 21+32, 3, 3
786 + xxlor 22+32, 4, 4
787 +
788 + vcipher 15, 15, 19
789 + vcipher 15, 15, 20
790 + vcipher 15, 15, 21
791 + vcipher 15, 15, 22
792 +
793 + xxlor 19+32, 5, 5
794 + xxlor 20+32, 6, 6
795 + xxlor 21+32, 7, 7
796 + xxlor 22+32, 8, 8
797 +
798 + vcipher 15, 15, 19
799 + vcipher 15, 15, 20
800 + vcipher 15, 15, 21
801 + vcipher 15, 15, 22
802 +
803 + xxlor 19+32, 9, 9
804 + vcipher 15, 15, 19
805 +.endm
806 +
807 +Next_rem_block:
808 + lxvb16x 15, 0, 14 # load block
809 +
810 + Loop_aes_middle_1x
811 +
812 + xxlor 23+32, 10, 10
813 +
814 + cmpdi 10, 10
815 + beq Do_next_1x
816 +
817 + # 192 bits
818 + xxlor 24+32, 11, 11
819 +
820 + vcipher 15, 15, 23
821 + vcipher 15, 15, 24
822 +
823 + xxlor 23+32, 12, 12
824 +
825 + cmpdi 10, 12
826 + beq Do_next_1x
827 +
828 + # 256 bits
829 + xxlor 24+32, 13, 13
830 +
831 + vcipher 15, 15, 23
832 + vcipher 15, 15, 24
833 +
834 + xxlor 23+32, 14, 14
835 +
836 + cmpdi 10, 14
837 + beq Do_next_1x
838 +
839 +Do_next_1x:
840 + vcipherlast 15, 15, 23
841 +
842 + xxlxor 47, 47, 15
843 + stxvb16x 47, 0, 9 # store output
844 + addi 14, 14, 16
845 + addi 9, 9, 16
846 +
847 + vmr 28, 15
848 + ppc_update_hash_1x
849 +
850 + addi 12, 12, -16
851 + addi 11, 11, 16
852 + xxlor 19+32, 0, 0
853 + vaddudm 30, 30, 31 # IV + counter
854 + vxor 15, 30, 19 # add round key
855 +
856 + bdnz Next_rem_block
857 +
858 + cmpdi 12, 0
859 + beq aes_gcm_out
860 +
861 +Final_block:
862 + Loop_aes_middle_1x
863 +
864 + xxlor 23+32, 10, 10
865 +
866 + cmpdi 10, 10
867 + beq Do_final_1x
868 +
869 + # 192 bits
870 + xxlor 24+32, 11, 11
871 +
872 + vcipher 15, 15, 23
873 + vcipher 15, 15, 24
874 +
875 + xxlor 23+32, 12, 12
876 +
877 + cmpdi 10, 12
878 + beq Do_final_1x
879 +
880 + # 256 bits
881 + xxlor 24+32, 13, 13
882 +
883 + vcipher 15, 15, 23
884 + vcipher 15, 15, 24
885 +
886 + xxlor 23+32, 14, 14
887 +
888 + cmpdi 10, 14
889 + beq Do_final_1x
890 +
891 +Do_final_1x:
892 + vcipherlast 15, 15, 23
893 +
894 + lxvb16x 15, 0, 14 # load last block
895 + xxlxor 47, 47, 15
896 +
897 + # create partial block mask
898 + li 15, 16
899 + sub 15, 15, 12 # index to the mask
900 +
901 + vspltisb 16, -1 # first 16 bytes - 0xffff...ff
902 + vspltisb 17, 0 # second 16 bytes - 0x0000...00
903 + li 10, 192
904 + stvx 16, 10, 1
905 + addi 10, 10, 16
906 + stvx 17, 10, 1
907 +
908 + addi 10, 1, 192
909 + lxvb16x 16, 15, 10 # load partial block mask
910 + xxland 47, 47, 16
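+ # C sketch of the mask trick: the stack at 192(r1) now holds 16 0xff
+ # bytes followed by 16 0x00 bytes, and with rem = remaining bytes the
+ # unaligned load above is
+ #
+ #   memcpy(mask, base + (16 - rem), 16);  /* rem 0xff's, then 0x00's */
+ #
+ # so the xxland keeps only the rem valid bytes of the final block.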
911 +
912 + vmr 28, 15
913 + ppc_update_hash_1x
914 +
915 + # store only the remaining bytes
916 + bl Write_partial_block
917 +
918 + b aes_gcm_out
919 +
920 +#
921 +# Write partial block
922 +# r9 - output
923 +# r12 - remaining bytes
924 +# v15 - partial input data
925 +#
926 +Write_partial_block:
927 + li 10, 192
928 + stxvb16x 15+32, 10, 1 # last block
929 +
930 + #add 10, 9, 11 # Output
931 + addi 10, 9, -1
932 + addi 16, 1, 191
933 +
934 + mtctr 12 # remaining bytes
935 + li 15, 0
936 +
937 +Write_last_byte:
938 + lbzu 14, 1(16)
939 + stbu 14, 1(10)
940 + bdnz Write_last_byte
941 + blr
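+# Roughly, in C (out = r9, rem = r12, staged = the block stored at 192(r1)):
+#
+#   for (i = 0; i < rem; i++)
+#       out[i] = staged[i];
+#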
942 +
943 +aes_gcm_out:
944 + # out = state
945 + stxvb16x 32, 0, 8 # write out Xi
946 + add 3, 11, 12 # return count
947 +
948 + li 9, 256
949 + lvx 20, 9, 1
950 + addi 9, 9, 16
951 + lvx 21, 9, 1
952 + addi 9, 9, 16
953 + lvx 22, 9, 1
954 + addi 9, 9, 16
955 + lvx 23, 9, 1
956 + addi 9, 9, 16
957 + lvx 24, 9, 1
958 + addi 9, 9, 16
959 + lvx 25, 9, 1
960 + addi 9, 9, 16
961 + lvx 26, 9, 1
962 + addi 9, 9, 16
963 + lvx 27, 9, 1
964 + addi 9, 9, 16
965 + lvx 28, 9, 1
966 + addi 9, 9, 16
967 + lvx 29, 9, 1
968 + addi 9, 9, 16
969 + lvx 30, 9, 1
970 + addi 9, 9, 16
971 + lvx 31, 9, 1
972 +
973 + ld 0, 528(1)
974 + ld 14,112(1)
975 + ld 15,120(1)
976 + ld 16,128(1)
977 + ld 17,136(1)
978 + ld 18,144(1)
979 + ld 19,152(1)
980 + ld 20,160(1)
981 + ld 21,168(1)
982 +
983 + mtlr 0
984 + addi 1, 1, 512
985 + blr
986 +
987 +#
988 +# 8x Decrypt
989 +#
990 +.global ppc_aes_gcm_decrypt
991 +.align 5
992 +ppc_aes_gcm_decrypt:
993 +_ppc_aes_gcm_decrypt:
994 +
995 + stdu 1,-512(1)
996 + mflr 0
997 +
998 + std 14,112(1)
999 + std 15,120(1)
1000 + std 16,128(1)
1001 + std 17,136(1)
1002 + std 18,144(1)
1003 + std 19,152(1)
1004 + std 20,160(1)
1005 + std 21,168(1)
1006 + li 9, 256
1007 + stvx 20, 9, 1
1008 + addi 9, 9, 16
1009 + stvx 21, 9, 1
1010 + addi 9, 9, 16
1011 + stvx 22, 9, 1
1012 + addi 9, 9, 16
1013 + stvx 23, 9, 1
1014 + addi 9, 9, 16
1015 + stvx 24, 9, 1
1016 + addi 9, 9, 16
1017 + stvx 25, 9, 1
1018 + addi 9, 9, 16
1019 + stvx 26, 9, 1
1020 + addi 9, 9, 16
1021 + stvx 27, 9, 1
1022 + addi 9, 9, 16
1023 + stvx 28, 9, 1
1024 + addi 9, 9, 16
1025 + stvx 29, 9, 1
1026 + addi 9, 9, 16
1027 + stvx 30, 9, 1
1028 + addi 9, 9, 16
1029 + stvx 31, 9, 1
1030 + std 0, 528(1)
1031 +
1032 + # Load Xi
1033 + lxvb16x 32, 0, 8 # load Xi
1034 +
1035 + # load Hash - h, h^2, h^3, h^4
1036 + li 10, 32
1037 + lxvd2x 2+32, 10, 8 # H Poly
1038 + li 10, 48
1039 + lxvd2x 3+32, 10, 8 # Hl
1040 + li 10, 64
1041 + lxvd2x 4+32, 10, 8 # H
1042 + li 10, 80
1043 + lxvd2x 5+32, 10, 8 # Hh
1044 +
1045 + li 10, 96
1046 + lxvd2x 6+32, 10, 8 # H^2l
1047 + li 10, 112
1048 + lxvd2x 7+32, 10, 8 # H^2
1049 + li 10, 128
1050 + lxvd2x 8+32, 10, 8 # H^2h
1051 +
1052 + li 10, 144
1053 + lxvd2x 9+32, 10, 8 # H^3l
1054 + li 10, 160
1055 + lxvd2x 10+32, 10, 8 # H^3
1056 + li 10, 176
1057 + lxvd2x 11+32, 10, 8 # H^3h
1058 +
1059 + li 10, 192
1060 + lxvd2x 12+32, 10, 8 # H^4l
1061 + li 10, 208
1062 + lxvd2x 13+32, 10, 8 # H^4
1063 + li 10, 224
1064 + lxvd2x 14+32, 10, 8 # H^4h
1065 +
1066 + # initialize ICB: GHASH( IV ), IV - r7
1067 + lxvb16x 30+32, 0, 7 # load IV - v30
1068 +
1069 + mr 12, 5 # length
1070 + li 11, 0 # block index
1071 +
1072 + # counter 1
1073 + vxor 31, 31, 31
1074 + vspltisb 22, 1
1075 + vsldoi 31, 31, 22, 1 # counter 1
1076 +
1077 + # load round key to VSR
1078 + lxv 0, 0(6)
1079 + lxv 1, 0x10(6)
1080 + lxv 2, 0x20(6)
1081 + lxv 3, 0x30(6)
1082 + lxv 4, 0x40(6)
1083 + lxv 5, 0x50(6)
1084 + lxv 6, 0x60(6)
1085 + lxv 7, 0x70(6)
1086 + lxv 8, 0x80(6)
1087 + lxv 9, 0x90(6)
1088 + lxv 10, 0xa0(6)
1089 +
1090 + # load rounds - 10 (128), 12 (192), 14 (256)
1091 + lwz 9,240(6)
1092 +
1093 + #
1094 + # vxor state, state, w # addroundkey
1095 + xxlor 32+29, 0, 0
1096 + vxor 15, 30, 29 # IV + round key - add round key 0
1097 +
1098 + cmpdi 9, 10
1099 + beq Loop_aes_gcm_8x_dec
1100 +
1101 + # load 2 more round keys (v11, v12)
1102 + lxv 11, 0xb0(6)
1103 + lxv 12, 0xc0(6)
1104 +
1105 + cmpdi 9, 12
1106 + beq Loop_aes_gcm_8x_dec
1107 +
1108 + # load 2 more round keys (v13, v14)
1109 + lxv 13, 0xd0(6)
1110 + lxv 14, 0xe0(6)
1111 + cmpdi 9, 14
1112 + beq Loop_aes_gcm_8x_dec
1113 +
1114 + b aes_gcm_out
1115 +
1116 +.align 5
1117 +Loop_aes_gcm_8x_dec:
1118 + mr 14, 3
1119 + mr 9, 4
1120 +
1121 + # n blocks
1122 + li 10, 128
1123 + divdu 10, 5, 10 # n 128-byte blocks
1124 + cmpdi 10, 0
1125 + beq Loop_last_block_dec
1126 +
1127 + vaddudm 30, 30, 31 # IV + counter
1128 + vxor 16, 30, 29
1129 + vaddudm 30, 30, 31
1130 + vxor 17, 30, 29
1131 + vaddudm 30, 30, 31
1132 + vxor 18, 30, 29
1133 + vaddudm 30, 30, 31
1134 + vxor 19, 30, 29
1135 + vaddudm 30, 30, 31
1136 + vxor 20, 30, 29
1137 + vaddudm 30, 30, 31
1138 + vxor 21, 30, 29
1139 + vaddudm 30, 30, 31
1140 + vxor 22, 30, 29
1141 +
1142 + mtctr 10
1143 +
1144 + li 15, 16
1145 + li 16, 32
1146 + li 17, 48
1147 + li 18, 64
1148 + li 19, 80
1149 + li 20, 96
1150 + li 21, 112
1151 +
1152 + lwz 10, 240(6)
1153 +
1154 +Loop_8x_block_dec:
1155 +
1156 + lxvb16x 15, 0, 14 # load block
1157 + lxvb16x 16, 15, 14 # load block
1158 + lxvb16x 17, 16, 14 # load block
1159 + lxvb16x 18, 17, 14 # load block
1160 + lxvb16x 19, 18, 14 # load block
1161 + lxvb16x 20, 19, 14 # load block
1162 + lxvb16x 21, 20, 14 # load block
1163 + lxvb16x 22, 21, 14 # load block
1164 + addi 14, 14, 128
1165 +
1166 + Loop_aes_middle8x
1167 +
1168 + xxlor 23+32, 10, 10
1169 +
1170 + cmpdi 10, 10
1171 + beq Do_last_aes_dec
1172 +
1173 + # 192 bits
1174 + xxlor 24+32, 11, 11
1175 +
1176 + vcipher 15, 15, 23
1177 + vcipher 16, 16, 23
1178 + vcipher 17, 17, 23
1179 + vcipher 18, 18, 23
1180 + vcipher 19, 19, 23
1181 + vcipher 20, 20, 23
1182 + vcipher 21, 21, 23
1183 + vcipher 22, 22, 23
1184 +
1185 + vcipher 15, 15, 24
1186 + vcipher 16, 16, 24
1187 + vcipher 17, 17, 24
1188 + vcipher 18, 18, 24
1189 + vcipher 19, 19, 24
1190 + vcipher 20, 20, 24
1191 + vcipher 21, 21, 24
1192 + vcipher 22, 22, 24
1193 +
1194 + xxlor 23+32, 12, 12
1195 +
1196 + cmpdi 10, 12
1197 + beq Do_last_aes_dec
1198 +
1199 + # 256 bits
1200 + xxlor 24+32, 13, 13
1201 +
1202 + vcipher 15, 15, 23
1203 + vcipher 16, 16, 23
1204 + vcipher 17, 17, 23
1205 + vcipher 18, 18, 23
1206 + vcipher 19, 19, 23
1207 + vcipher 20, 20, 23
1208 + vcipher 21, 21, 23
1209 + vcipher 22, 22, 23
1210 +
1211 + vcipher 15, 15, 24
1212 + vcipher 16, 16, 24
1213 + vcipher 17, 17, 24
1214 + vcipher 18, 18, 24
1215 + vcipher 19, 19, 24
1216 + vcipher 20, 20, 24
1217 + vcipher 21, 21, 24
1218 + vcipher 22, 22, 24
1219 +
1220 + xxlor 23+32, 14, 14
1221 +
1222 + cmpdi 10, 14
1223 + beq Do_last_aes_dec
1224 + b aes_gcm_out
1225 +
1226 +Do_last_aes_dec:
1227 +
1228 + #
1229 + # last round
1230 + vcipherlast 15, 15, 23
1231 + vcipherlast 16, 16, 23
1232 +
1233 + xxlxor 47, 47, 15
1234 + stxvb16x 47, 0, 9 # store output
1235 + xxlxor 48, 48, 16
1236 + stxvb16x 48, 15, 9 # store output
1237 +
1238 + vcipherlast 17, 17, 23
1239 + vcipherlast 18, 18, 23
1240 +
1241 + xxlxor 49, 49, 17
1242 + stxvb16x 49, 16, 9 # store output
1243 + xxlxor 50, 50, 18
1244 + stxvb16x 50, 17, 9 # store output
1245 +
1246 + vcipherlast 19, 19, 23
1247 + vcipherlast 20, 20, 23
1248 +
1249 + xxlxor 51, 51, 19
1250 + stxvb16x 51, 18, 9 # store output
1251 + xxlxor 52, 52, 20
1252 + stxvb16x 52, 19, 9 # store output
1253 +
1254 + vcipherlast 21, 21, 23
1255 + vcipherlast 22, 22, 23
1256 +
1257 + xxlxor 53, 53, 21
1258 + stxvb16x 53, 20, 9 # store output
1259 + xxlxor 54, 54, 22
1260 + stxvb16x 54, 21, 9 # store output
1261 +
1262 + addi 9, 9, 128
1263 +
1264 + xxlor 15+32, 15, 15
1265 + xxlor 16+32, 16, 16
1266 + xxlor 17+32, 17, 17
1267 + xxlor 18+32, 18, 18
1268 + xxlor 19+32, 19, 19
1269 + xxlor 20+32, 20, 20
1270 + xxlor 21+32, 21, 21
1271 + xxlor 22+32, 22, 22
1272 +
1273 + # ghash here
1274 + ppc_aes_gcm_ghash2_4x
1275 +
1276 + xxlor 27+32, 0, 0
1277 + vaddudm 30, 30, 31 # IV + counter
1278 + vmr 29, 30
1279 + vxor 15, 30, 27 # add round key
1280 + vaddudm 30, 30, 31
1281 + vxor 16, 30, 27
1282 + vaddudm 30, 30, 31
1283 + vxor 17, 30, 27
1284 + vaddudm 30, 30, 31
1285 + vxor 18, 30, 27
1286 + vaddudm 30, 30, 31
1287 + vxor 19, 30, 27
1288 + vaddudm 30, 30, 31
1289 + vxor 20, 30, 27
1290 + vaddudm 30, 30, 31
1291 + vxor 21, 30, 27
1292 + vaddudm 30, 30, 31
1293 + vxor 22, 30, 27
1294 + addi 12, 12, -128
1295 + addi 11, 11, 128
1296 +
1297 + bdnz Loop_8x_block_dec
1298 +
1299 + vmr 30, 29
1300 +
1301 +Loop_last_block_dec:
1302 + cmpdi 12, 0
1303 + beq aes_gcm_out
1304 +
1305 + # loop last few blocks
1306 + li 10, 16
1307 + divdu 10, 12, 10
1308 +
1309 + mtctr 10
1310 +
1311 + lwz 10,240(6)
1312 +
1313 + cmpdi 12, 16
1314 + blt Final_block_dec
1315 +
1316 +Next_rem_block_dec:
1317 + lxvb16x 15, 0, 14 # load block
1318 +
1319 + Loop_aes_middle_1x
1320 +
1321 + xxlor 23+32, 10, 10
1322 +
1323 + cmpdi 10, 10
1324 + beq Do_next_1x_dec
1325 +
1326 + # 192 bits
1327 + xxlor 24+32, 11, 11
1328 +
1329 + vcipher 15, 15, 23
1330 + vcipher 15, 15, 24
1331 +
1332 + xxlor 23+32, 12, 12
1333 +
1334 + cmpdi 10, 12
1335 + beq Do_next_1x_dec
1336 +
1337 + # 256 bits
1338 + xxlor 24+32, 13, 13
1339 +
1340 + vcipher 15, 15, 23
1341 + vcipher 15, 15, 24
1342 +
1343 + xxlor 23+32, 14, 14
1344 +
1345 + cmpdi 10, 14
1346 + beq Do_next_1x_dec
1347 +
1348 +Do_next_1x_dec:
1349 + vcipherlast 15, 15, 23
1350 +
1351 + xxlxor 47, 47, 15
1352 + stxvb16x 47, 0, 9 # store output
1353 + addi 14, 14, 16
1354 + addi 9, 9, 16
1355 +
1356 + xxlor 28+32, 15, 15
1357 + ppc_update_hash_1x
1358 +
1359 + addi 12, 12, -16
1360 + addi 11, 11, 16
1361 + xxlor 19+32, 0, 0
1362 + vaddudm 30, 30, 31 # IV + counter
1363 + vxor 15, 30, 19 # add round key
1364 +
1365 + bdnz Next_rem_block_dec
1366 +
1367 + cmpdi 12, 0
1368 + beq aes_gcm_out
1369 +
1370 +Final_block_dec:
1371 + Loop_aes_middle_1x
1372 +
1373 + xxlor 23+32, 10, 10
1374 +
1375 + cmpdi 10, 10
1376 + beq Do_final_1x_dec
1377 +
1378 + # 192 bits
1379 + xxlor 24+32, 11, 11
1380 +
1381 + vcipher 15, 15, 23
1382 + vcipher 15, 15, 24
1383 +
1384 + xxlor 23+32, 12, 12
1385 +
1386 + cmpdi 10, 12
1387 + beq Do_final_1x_dec
1388 +
1389 + # 256 bits
1390 + xxlor 24+32, 13, 13
1391 +
1392 + vcipher 15, 15, 23
1393 + vcipher 15, 15, 24
1394 +
1395 + xxlor 23+32, 14, 14
1396 +
1397 + cmpdi 10, 14
1398 + beq Do_final_1x_dec
1399 +
1400 +Do_final_1x_dec:
1401 + vcipherlast 15, 15, 23
1402 +
1403 + lxvb16x 15, 0, 14 # load block
1404 + xxlxor 47, 47, 15
1405 +
1406 + # create partial block mask
1407 + li 15, 16
1408 + sub 15, 15, 12 # index to the mask
1409 +
1410 + vspltisb 16, -1 # first 16 bytes - 0xffff...ff
1411 + vspltisb 17, 0 # second 16 bytes - 0x0000...00
1412 + li 10, 192
1413 + stvx 16, 10, 1
1414 + addi 10, 10, 16
1415 + stvx 17, 10, 1
1416 +
1417 + addi 10, 1, 192
1418 + lxvb16x 16, 15, 10 # load block mask
1419 + xxland 47, 47, 16
1420 +
1421 + xxlor 28+32, 15, 15
1422 + ppc_update_hash_1x
1423 +
1424 + # store only the remaining bytes
1425 + bl Write_partial_block
1426 +
1427 + b aes_gcm_out
1428 +
1429 +
1430 +___
1431 +
1432 +foreach (split("\n",$code)) {
1433 + s/\`([^\`]*)\`/eval $1/geo;
1434 +
1435 + if ($flavour =~ /le$/o) { # little-endian
1436 + s/le\?//o or
1437 + s/be\?/#be#/o;
1438 + } else {
1439 + s/le\?/#le#/o or
1440 + s/be\?//o;
1441 + }
1442 + print $_,"\n";
1443 +}
1444 +
1445 +close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1446 diff --git a/crypto/modes/build.info b/crypto/modes/build.info
1447 index 687e872..0ea122e 100644
1448 --- a/crypto/modes/build.info
1449 +++ b/crypto/modes/build.info
1450 @@ -32,7 +32,7 @@ IF[{- !$disabled{asm} -}]
1451 $MODESASM_parisc20_64=$MODESASM_parisc11
1452 $MODESDEF_parisc20_64=$MODESDEF_parisc11
1453
1454 - $MODESASM_ppc32=ghashp8-ppc.s
1455 + $MODESASM_ppc32=ghashp8-ppc.s aes-gcm-ppc.s
1456 $MODESDEF_ppc32=
1457 $MODESASM_ppc64=$MODESASM_ppc32
1458 $MODESDEF_ppc64=$MODESDEF_ppc32
1459 @@ -71,6 +71,7 @@ INCLUDE[ghash-sparcv9.o]=..
1460 GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
1461 GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
1462 GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
1463 +GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
1464 GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
1465 INCLUDE[ghash-armv4.o]=..
1466 GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
1467 diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
1468 index e95ad5a..0c281a3 100644
1469 --- a/include/crypto/aes_platform.h
1470 +++ b/include/crypto/aes_platform.h
1471 @@ -74,6 +74,26 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
1472 # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
1473 # define HWAES_xts_encrypt aes_p8_xts_encrypt
1474 # define HWAES_xts_decrypt aes_p8_xts_decrypt
1475 +# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300)
1476 +# define AES_GCM_ENC_BYTES 128
1477 +# define AES_GCM_DEC_BYTES 128
1478 +size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
1479 + size_t len, const void *key, unsigned char ivec[16],
1480 + u64 *Xi);
1481 +size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
1482 + size_t len, const void *key, unsigned char ivec[16],
1483 + u64 *Xi);
1484 +size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out,
1485 + size_t len, const void *key,
1486 + unsigned char ivec[16], u64 *Xi);
1487 +size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out,
1488 + size_t len, const void *key,
1489 + unsigned char ivec[16], u64 *Xi);
1490 +# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap
1491 +# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap
1492 +# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \
1493 + (gctx)->gcm.ghash==gcm_ghash_p8)
1494 +void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
1495 # endif /* PPC */
1496
1497 # if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
1498 diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1499 index 44fa9d4..789ec12 100644
1500 --- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1501 +++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1502 @@ -141,6 +141,8 @@ static const PROV_GCM_HW aes_gcm = {
1503 # include "cipher_aes_gcm_hw_t4.inc"
1504 #elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM)
1505 # include "cipher_aes_gcm_hw_armv8.inc"
1506 +#elif defined(PPC_AES_GCM_CAPABLE)
1507 +# include "cipher_aes_gcm_hw_ppc.inc"
1508 #else
1509 const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
1510 {
1511 diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
1512 new file mode 100644
1513 index 0000000..4eed0f4
1514 --- /dev/null
1515 +++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
1516 @@ -0,0 +1,119 @@
1517 +/*
1518 + * Copyright 2001-2021 The OpenSSL Project Authors. All Rights Reserved.
1519 + *
1520 + * Licensed under the Apache License 2.0 (the "License"). You may not use
1521 + * this file except in compliance with the License. You can obtain a copy
1522 + * in the file LICENSE in the source distribution or at
1523 + * https://www.openssl.org/source/license.html
1524 + */
1525 +
1526 +/*-
1527 + * PPC support for AES GCM.
1528 + * This file is included by cipher_aes_gcm_hw.c
1529 + */
1530 +
1531 +static int aes_ppc_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
1532 + size_t keylen)
1533 +{
1534 + PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx;
1535 + AES_KEY *ks = &actx->ks.ks;
1536 +
1537 + GCM_HW_SET_KEY_CTR_FN(ks, aes_p8_set_encrypt_key, aes_p8_encrypt,
1538 + aes_p8_ctr32_encrypt_blocks);
1539 + return 1;
1540 +}
1541 +
1542 +
1543 +extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
1544 + const void *key, unsigned char ivec[16], u64 *Xi);
1545 +extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
1546 + const void *key, unsigned char ivec[16], u64 *Xi);
1547 +
1548 +static inline u32 UTO32(unsigned char *buf)
1549 +{
1550 + return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]);
1551 +}
1552 +
1553 +static inline u32 add32TOU(unsigned char buf[4], u32 n)
1554 +{
1555 + u32 r;
1556 +
1557 + r = UTO32(buf);
1558 + r += n;
1559 + buf[0] = (unsigned char) (r >> 24) & 0xFF;
1560 + buf[1] = (unsigned char) (r >> 16) & 0xFF;
1561 + buf[2] = (unsigned char) (r >> 8) & 0xFF;
1562 + buf[3] = (unsigned char) r & 0xFF;
1563 + return r;
1564 +}
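+/*
+ * Example: if buf holds { 0x00, 0x00, 0x00, 0xfe }, UTO32(buf) returns
+ * 0xfe, and add32TOU(buf, 3) stores 0x00000101 back, leaving
+ * { 0x00, 0x00, 0x01, 0x01 }.
+ */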
1565 +
1566 +static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len,
1567 + const void *key, unsigned char ivec[16], u64 *Xi, int encrypt)
1568 +{
1569 + int s = 0;
1570 + int ndone = 0;
1571 + int ctr_reset = 0;
1572 + u64 blocks_unused;
1573 + u64 nb = len / 16;
1574 + u64 next_ctr = 0;
1575 + unsigned char ctr_saved[12];
1576 +
1577 + memcpy(ctr_saved, ivec, 12);
1578 +
1579 + while (nb) {
1580 + blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12);
1581 + if (nb > blocks_unused) {
1582 + len = blocks_unused * 16;
1583 + nb -= blocks_unused;
1584 + next_ctr = blocks_unused;
1585 + ctr_reset = 1;
1586 + } else {
1587 + len = nb * 16;
1588 + next_ctr = nb;
1589 + nb = 0;
1590 + }
1591 +
1592 + s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi)
1593 + : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi);
1594 +
1595 + /* add counter to ivec */
1596 + add32TOU(ivec + 12, (u32) next_ctr);
1597 + if (ctr_reset) {
1598 + ctr_reset = 0;
1599 + in += len;
1600 + out += len;
1601 + }
1602 + memcpy(ivec, ctr_saved, 12);
1603 + ndone += s;
1604 + }
1605 +
1606 + return ndone;
1607 +}
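+/*
+ * Worked example of the split above: with the 32-bit counter word at
+ * 0xfffffffe and len = 64 (nb = 4 blocks), blocks_unused = 2, so the
+ * first pass crypts 32 bytes, add32TOU() wraps the counter word to 0,
+ * and a second pass finishes the remaining two blocks.  The assembly
+ * core therefore never sees a 32-bit counter wrap mid-call.
+ */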
1608 +
1609 +size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
1610 + const void *key, unsigned char ivec[16], u64 *Xi)
1611 +{
1612 + return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1);
1613 +}
1614 +
1615 +size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
1616 + const void *key, unsigned char ivec[16], u64 *Xi)
1617 +{
1618 + return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0);
1619 +}
1620 +
1621 +
1622 +static const PROV_GCM_HW aes_ppc_gcm = {
1623 + aes_ppc_gcm_initkey,
1624 + ossl_gcm_setiv,
1625 + ossl_gcm_aad_update,
1626 + generic_aes_gcm_cipher_update,
1627 + ossl_gcm_cipher_final,
1628 + ossl_gcm_one_shot
1629 +};
1630 +
1631 +const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
1632 +{
1633 + return PPC_AES_GCM_CAPABLE ? &aes_ppc_gcm : &aes_gcm;
1634 +}
1635 +
