/[smecontribs]/rpms/openssl3/contribs10/0071-AES-GCM-performance-optimization.patch

Revision 1.1
Wed Jan 31 17:24:43 2024 UTC by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 jpp 1.1 Upstream-Status: Backport [https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c, https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd]
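The new crypto/modes/asm/aes-gcm-ppc.pl added below hashes four ciphertext blocks per GHASH pass against the precomputed powers H, H^2, H^3 and H^4 (see the header comment in that file). The speed-up rests on a simple identity in a field of characteristic 2: four dependent Horner steps collapse into four independent multiplications folded with XOR. A minimal stand-in sketch of that identity, written in C over GF(2^8) instead of GHASH's GF(2^128) so it stays a few lines long; gf8_mul and the sample values are illustrative only, not part of the patch:

/*
 * Demonstrates: ((((Y^X1)*H ^ X2)*H ^ X3)*H ^ X4)*H
 *            == (Y^X1)*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H
 * over GF(2^8) (AES polynomial 0x11B) as a stand-in for GF(2^128).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* carry-less multiply modulo x^8 + x^4 + x^3 + x + 1 */
static uint8_t gf8_mul(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    while (b) {
        if (b & 1)
            r ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1B : 0));
        b >>= 1;
    }
    return r;
}

int main(void)
{
    uint8_t H = 0x53, Y = 0xCA, X[4] = { 0x11, 0x22, 0x33, 0x44 };
    uint8_t H2 = gf8_mul(H, H), H3 = gf8_mul(H2, H), H4 = gf8_mul(H3, H);

    /* serial Horner form: one dependent multiply per block */
    uint8_t serial = Y;
    for (int i = 0; i < 4; i++)
        serial = gf8_mul((uint8_t)(serial ^ X[i]), H);

    /* aggregated form: four independent multiplies, one XOR fold */
    uint8_t agg = gf8_mul((uint8_t)(Y ^ X[0]), H4) ^ gf8_mul(X[1], H3)
                ^ gf8_mul(X[2], H2) ^ gf8_mul(X[3], H);

    assert(serial == agg);
    printf("serial=%02x aggregated=%02x\n", serial, agg);
    return 0;
}

The same distributivity over XOR is what lets the assembly below issue the vpmsumd multiplies for four blocks independently and fold them with a single XOR and reduction tree.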
2     diff --git a/crypto/modes/asm/aes-gcm-ppc.pl b/crypto/modes/asm/aes-gcm-ppc.pl
3     new file mode 100644
4     index 0000000..6624e6c
5     --- /dev/null
6     +++ b/crypto/modes/asm/aes-gcm-ppc.pl
7     @@ -0,0 +1,1438 @@
8     +#! /usr/bin/env perl
9     +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
10     +# Copyright 2021- IBM Inc. All rights reserved
11     +#
12     +# Licensed under the Apache License 2.0 (the "License"). You may not use
13     +# this file except in compliance with the License. You can obtain a copy
14     +# in the file LICENSE in the source distribution or at
15     +# https://www.openssl.org/source/license.html
16     +#
17     +#===================================================================================
18     +# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
19     +#
20     +# GHASH is based on the Karatsuba multiplication method.
21     +#
22     +# Xi xor X1
23     +#
24     +# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
25     +# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
26     +# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
27     +# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
28     +# (X4.h * H.h + X4.l * H.l + X4 * H)
29     +#
30     +# Xi = v0
31     +# H Poly = v2
32     +# Hash keys = v3 - v14
33     +# ( H.l, H, H.h)
34     +# ( H^2.l, H^2, H^2.h)
35     +# ( H^3.l, H^3, H^3.h)
36     +# ( H^4.l, H^4, H^4.h)
37     +#
38     +# v30 is IV
39     +# v31 - counter 1
40     +#
41     +# AES used,
42     +# vs0 - vs14 for round keys
43     +# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
44     +#
45     +# This implementation uses a stitched AES-GCM approach to improve overall performance.
46     +# AES is implemented with 8x blocks and GHASH uses two 4x blocks.
47     +#
48     +# Current large block (16384 bytes) performance per second with 128 bit key --
49     +#
50     +# Encrypt Decrypt
51     +# Power10[le] (3.5GHz) 5.32G 5.26G
52     +#
53     +# ===================================================================================
54     +#
55     +# $output is the last argument if it looks like a file (it has an extension)
56     +# $flavour is the first argument if it doesn't look like a file
57     +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
58     +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
59     +
60     +if ($flavour =~ /64/) {
61     + $SIZE_T=8;
62     + $LRSAVE=2*$SIZE_T;
63     + $STU="stdu";
64     + $POP="ld";
65     + $PUSH="std";
66     + $UCMP="cmpld";
67     + $SHRI="srdi";
68     +} elsif ($flavour =~ /32/) {
69     + $SIZE_T=4;
70     + $LRSAVE=$SIZE_T;
71     + $STU="stwu";
72     + $POP="lwz";
73     + $PUSH="stw";
74     + $UCMP="cmplw";
75     + $SHRI="srwi";
76     +} else { die "nonsense $flavour"; }
77     +
78     +$sp="r1";
79     +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
80     +
81     +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
82     +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
83     +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
84     +die "can't locate ppc-xlate.pl";
85     +
86     +open STDOUT,"| $^X $xlate $flavour \"$output\""
87     + or die "can't call $xlate: $!";
88     +
89     +$code=<<___;
90     +.machine "any"
91     +.text
92     +
93     +# 4x loops
94     +# v15 - v18 - input states
95     +# vs1 - vs9 - round keys
96     +#
97     +.macro Loop_aes_middle4x
98     + xxlor 19+32, 1, 1
99     + xxlor 20+32, 2, 2
100     + xxlor 21+32, 3, 3
101     + xxlor 22+32, 4, 4
102     +
103     + vcipher 15, 15, 19
104     + vcipher 16, 16, 19
105     + vcipher 17, 17, 19
106     + vcipher 18, 18, 19
107     +
108     + vcipher 15, 15, 20
109     + vcipher 16, 16, 20
110     + vcipher 17, 17, 20
111     + vcipher 18, 18, 20
112     +
113     + vcipher 15, 15, 21
114     + vcipher 16, 16, 21
115     + vcipher 17, 17, 21
116     + vcipher 18, 18, 21
117     +
118     + vcipher 15, 15, 22
119     + vcipher 16, 16, 22
120     + vcipher 17, 17, 22
121     + vcipher 18, 18, 22
122     +
123     + xxlor 19+32, 5, 5
124     + xxlor 20+32, 6, 6
125     + xxlor 21+32, 7, 7
126     + xxlor 22+32, 8, 8
127     +
128     + vcipher 15, 15, 19
129     + vcipher 16, 16, 19
130     + vcipher 17, 17, 19
131     + vcipher 18, 18, 19
132     +
133     + vcipher 15, 15, 20
134     + vcipher 16, 16, 20
135     + vcipher 17, 17, 20
136     + vcipher 18, 18, 20
137     +
138     + vcipher 15, 15, 21
139     + vcipher 16, 16, 21
140     + vcipher 17, 17, 21
141     + vcipher 18, 18, 21
142     +
143     + vcipher 15, 15, 22
144     + vcipher 16, 16, 22
145     + vcipher 17, 17, 22
146     + vcipher 18, 18, 22
147     +
148     + xxlor 23+32, 9, 9
149     + vcipher 15, 15, 23
150     + vcipher 16, 16, 23
151     + vcipher 17, 17, 23
152     + vcipher 18, 18, 23
153     +.endm
154     +
155     +# 8x loops
156     +# v15 - v22 - input states
157     +# vs1 - vs9 - round keys
158     +#
159     +.macro Loop_aes_middle8x
160     + xxlor 23+32, 1, 1
161     + xxlor 24+32, 2, 2
162     + xxlor 25+32, 3, 3
163     + xxlor 26+32, 4, 4
164     +
165     + vcipher 15, 15, 23
166     + vcipher 16, 16, 23
167     + vcipher 17, 17, 23
168     + vcipher 18, 18, 23
169     + vcipher 19, 19, 23
170     + vcipher 20, 20, 23
171     + vcipher 21, 21, 23
172     + vcipher 22, 22, 23
173     +
174     + vcipher 15, 15, 24
175     + vcipher 16, 16, 24
176     + vcipher 17, 17, 24
177     + vcipher 18, 18, 24
178     + vcipher 19, 19, 24
179     + vcipher 20, 20, 24
180     + vcipher 21, 21, 24
181     + vcipher 22, 22, 24
182     +
183     + vcipher 15, 15, 25
184     + vcipher 16, 16, 25
185     + vcipher 17, 17, 25
186     + vcipher 18, 18, 25
187     + vcipher 19, 19, 25
188     + vcipher 20, 20, 25
189     + vcipher 21, 21, 25
190     + vcipher 22, 22, 25
191     +
192     + vcipher 15, 15, 26
193     + vcipher 16, 16, 26
194     + vcipher 17, 17, 26
195     + vcipher 18, 18, 26
196     + vcipher 19, 19, 26
197     + vcipher 20, 20, 26
198     + vcipher 21, 21, 26
199     + vcipher 22, 22, 26
200     +
201     + xxlor 23+32, 5, 5
202     + xxlor 24+32, 6, 6
203     + xxlor 25+32, 7, 7
204     + xxlor 26+32, 8, 8
205     +
206     + vcipher 15, 15, 23
207     + vcipher 16, 16, 23
208     + vcipher 17, 17, 23
209     + vcipher 18, 18, 23
210     + vcipher 19, 19, 23
211     + vcipher 20, 20, 23
212     + vcipher 21, 21, 23
213     + vcipher 22, 22, 23
214     +
215     + vcipher 15, 15, 24
216     + vcipher 16, 16, 24
217     + vcipher 17, 17, 24
218     + vcipher 18, 18, 24
219     + vcipher 19, 19, 24
220     + vcipher 20, 20, 24
221     + vcipher 21, 21, 24
222     + vcipher 22, 22, 24
223     +
224     + vcipher 15, 15, 25
225     + vcipher 16, 16, 25
226     + vcipher 17, 17, 25
227     + vcipher 18, 18, 25
228     + vcipher 19, 19, 25
229     + vcipher 20, 20, 25
230     + vcipher 21, 21, 25
231     + vcipher 22, 22, 25
232     +
233     + vcipher 15, 15, 26
234     + vcipher 16, 16, 26
235     + vcipher 17, 17, 26
236     + vcipher 18, 18, 26
237     + vcipher 19, 19, 26
238     + vcipher 20, 20, 26
239     + vcipher 21, 21, 26
240     + vcipher 22, 22, 26
241     +
242     + xxlor 23+32, 9, 9
243     + vcipher 15, 15, 23
244     + vcipher 16, 16, 23
245     + vcipher 17, 17, 23
246     + vcipher 18, 18, 23
247     + vcipher 19, 19, 23
248     + vcipher 20, 20, 23
249     + vcipher 21, 21, 23
250     + vcipher 22, 22, 23
251     +.endm
252     +
253     +#
254     +# Compute 4x hash values based on Karatsuba method.
255     +#
256     +ppc_aes_gcm_ghash:
257     + vxor 15, 15, 0
258     +
259     + xxlxor 29, 29, 29
260     +
261     + vpmsumd 23, 12, 15 # H4.L * X.L
262     + vpmsumd 24, 9, 16
263     + vpmsumd 25, 6, 17
264     + vpmsumd 26, 3, 18
265     +
266     + vxor 23, 23, 24
267     + vxor 23, 23, 25
268     + vxor 23, 23, 26 # L
269     +
270     + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
271     + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
272     + vpmsumd 26, 7, 17
273     + vpmsumd 27, 4, 18
274     +
275     + vxor 24, 24, 25
276     + vxor 24, 24, 26
277     + vxor 24, 24, 27 # M
278     +
279     + # sum hash and reduction with H Poly
280     + vpmsumd 28, 23, 2 # reduction
281     +
282     + xxlor 29+32, 29, 29
283     + vsldoi 26, 24, 29, 8 # mL
284     + vsldoi 29, 29, 24, 8 # mH
285     + vxor 23, 23, 26 # mL + L
286     +
287     + vsldoi 23, 23, 23, 8 # swap
288     + vxor 23, 23, 28
289     +
290     + vpmsumd 24, 14, 15 # H4.H * X.H
291     + vpmsumd 25, 11, 16
292     + vpmsumd 26, 8, 17
293     + vpmsumd 27, 5, 18
294     +
295     + vxor 24, 24, 25
296     + vxor 24, 24, 26
297     + vxor 24, 24, 27
298     +
299     + vxor 24, 24, 29
300     +
301     + # sum hash and reduction with H Poly
302     + vsldoi 27, 23, 23, 8 # swap
303     + vpmsumd 23, 23, 2
304     + vxor 27, 27, 24
305     + vxor 23, 23, 27
306     +
307     + xxlor 32, 23+32, 23+32 # update hash
308     +
309     + blr
310     +
311     +#
312     +# Combine two 4x ghash
313     +# v15 - v22 - input blocks
314     +#
315     +.macro ppc_aes_gcm_ghash2_4x
316     + # first 4x hash
317     + vxor 15, 15, 0 # Xi + X
318     +
319     + xxlxor 29, 29, 29
320     +
321     + vpmsumd 23, 12, 15 # H4.L * X.L
322     + vpmsumd 24, 9, 16
323     + vpmsumd 25, 6, 17
324     + vpmsumd 26, 3, 18
325     +
326     + vxor 23, 23, 24
327     + vxor 23, 23, 25
328     + vxor 23, 23, 26 # L
329     +
330     + vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
331     + vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
332     + vpmsumd 26, 7, 17
333     + vpmsumd 27, 4, 18
334     +
335     + vxor 24, 24, 25
336     + vxor 24, 24, 26
337     +
338     + # sum hash and reduction with H Poly
339     + vpmsumd 28, 23, 2 # reduction
340     +
341     + xxlor 29+32, 29, 29
342     +
343     + vxor 24, 24, 27 # M
344     + vsldoi 26, 24, 29, 8 # mL
345     + vsldoi 29, 29, 24, 8 # mH
346     + vxor 23, 23, 26 # mL + L
347     +
348     + vsldoi 23, 23, 23, 8 # swap
349     + vxor 23, 23, 28
350     +
351     + vpmsumd 24, 14, 15 # H4.H * X.H
352     + vpmsumd 25, 11, 16
353     + vpmsumd 26, 8, 17
354     + vpmsumd 27, 5, 18
355     +
356     + vxor 24, 24, 25
357     + vxor 24, 24, 26
358     + vxor 24, 24, 27 # H
359     +
360     + vxor 24, 24, 29 # H + mH
361     +
362     + # sum hash and reduction with H Poly
363     + vsldoi 27, 23, 23, 8 # swap
364     + vpmsumd 23, 23, 2
365     + vxor 27, 27, 24
366     + vxor 27, 23, 27 # 1st Xi
367     +
368     + # 2nd 4x hash
369     + vpmsumd 24, 9, 20
370     + vpmsumd 25, 6, 21
371     + vpmsumd 26, 3, 22
372     + vxor 19, 19, 27 # Xi + X
373     + vpmsumd 23, 12, 19 # H4.L * X.L
374     +
375     + vxor 23, 23, 24
376     + vxor 23, 23, 25
377     + vxor 23, 23, 26 # L
378     +
379     + vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
380     + vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
381     + vpmsumd 26, 7, 21
382     + vpmsumd 27, 4, 22
383     +
384     + vxor 24, 24, 25
385     + vxor 24, 24, 26
386     +
387     + # sum hash and reduction with H Poly
388     + vpmsumd 28, 23, 2 # reduction
389     +
390     + xxlor 29+32, 29, 29
391     +
392     + vxor 24, 24, 27 # M
393     + vsldoi 26, 24, 29, 8 # mL
394     + vsldoi 29, 29, 24, 8 # mH
395     + vxor 23, 23, 26 # mL + L
396     +
397     + vsldoi 23, 23, 23, 8 # swap
398     + vxor 23, 23, 28
399     +
400     + vpmsumd 24, 14, 19 # H4.H * X.H
401     + vpmsumd 25, 11, 20
402     + vpmsumd 26, 8, 21
403     + vpmsumd 27, 5, 22
404     +
405     + vxor 24, 24, 25
406     + vxor 24, 24, 26
407     + vxor 24, 24, 27 # H
408     +
409     + vxor 24, 24, 29 # H + mH
410     +
411     + # sum hash and reduction with H Poly
412     + vsldoi 27, 23, 23, 8 # swap
413     + vpmsumd 23, 23, 2
414     + vxor 27, 27, 24
415     + vxor 23, 23, 27
416     +
417     + xxlor 32, 23+32, 23+32 # update hash
418     +
419     +.endm
420     +
421     +#
422     +# Compute update single hash
423     +#
424     +.macro ppc_update_hash_1x
425     + vxor 28, 28, 0
426     +
427     + vxor 19, 19, 19
428     +
429     + vpmsumd 22, 3, 28 # L
430     + vpmsumd 23, 4, 28 # M
431     + vpmsumd 24, 5, 28 # H
432     +
433     + vpmsumd 27, 22, 2 # reduction
434     +
435     + vsldoi 25, 23, 19, 8 # mL
436     + vsldoi 26, 19, 23, 8 # mH
437     + vxor 22, 22, 25 # LL + LL
438     + vxor 24, 24, 26 # HH + HH
439     +
440     + vsldoi 22, 22, 22, 8 # swap
441     + vxor 22, 22, 27
442     +
443     + vsldoi 20, 22, 22, 8 # swap
444     + vpmsumd 22, 22, 2 # reduction
445     + vxor 20, 20, 24
446     + vxor 22, 22, 20
447     +
448     + vmr 0, 22 # update hash
449     +
450     +.endm
451     +
452     +#
453     +# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
454     +# const AES_KEY *key, unsigned char iv[16],
455     +# void *Xip);
456     +#
457     +# r3 - inp
458     +# r4 - out
459     +# r5 - len
460     +# r6 - AES round keys
461     +# r7 - iv
462     +# r8 - Xi, H Poly, hash keys
463     +#
464     +.global ppc_aes_gcm_encrypt
465     +.align 5
466     +ppc_aes_gcm_encrypt:
467     +_ppc_aes_gcm_encrypt:
468     +
469     + stdu 1,-512(1)
470     + mflr 0
471     +
472     + std 14,112(1)
473     + std 15,120(1)
474     + std 16,128(1)
475     + std 17,136(1)
476     + std 18,144(1)
477     + std 19,152(1)
478     + std 20,160(1)
479     + std 21,168(1)
480     + li 9, 256
481     + stvx 20, 9, 1
482     + addi 9, 9, 16
483     + stvx 21, 9, 1
484     + addi 9, 9, 16
485     + stvx 22, 9, 1
486     + addi 9, 9, 16
487     + stvx 23, 9, 1
488     + addi 9, 9, 16
489     + stvx 24, 9, 1
490     + addi 9, 9, 16
491     + stvx 25, 9, 1
492     + addi 9, 9, 16
493     + stvx 26, 9, 1
494     + addi 9, 9, 16
495     + stvx 27, 9, 1
496     + addi 9, 9, 16
497     + stvx 28, 9, 1
498     + addi 9, 9, 16
499     + stvx 29, 9, 1
500     + addi 9, 9, 16
501     + stvx 30, 9, 1
502     + addi 9, 9, 16
503     + stvx 31, 9, 1
504     + std 0, 528(1)
505     +
506     + # Load Xi
507     + lxvb16x 32, 0, 8 # load Xi
508     +
509     + # load Hash - h^4, h^3, h^2, h
510     + li 10, 32
511     + lxvd2x 2+32, 10, 8 # H Poly
512     + li 10, 48
513     + lxvd2x 3+32, 10, 8 # Hl
514     + li 10, 64
515     + lxvd2x 4+32, 10, 8 # H
516     + li 10, 80
517     + lxvd2x 5+32, 10, 8 # Hh
518     +
519     + li 10, 96
520     + lxvd2x 6+32, 10, 8 # H^2l
521     + li 10, 112
522     + lxvd2x 7+32, 10, 8 # H^2
523     + li 10, 128
524     + lxvd2x 8+32, 10, 8 # H^2h
525     +
526     + li 10, 144
527     + lxvd2x 9+32, 10, 8 # H^3l
528     + li 10, 160
529     + lxvd2x 10+32, 10, 8 # H^3
530     + li 10, 176
531     + lxvd2x 11+32, 10, 8 # H^3h
532     +
533     + li 10, 192
534     + lxvd2x 12+32, 10, 8 # H^4l
535     + li 10, 208
536     + lxvd2x 13+32, 10, 8 # H^4
537     + li 10, 224
538     + lxvd2x 14+32, 10, 8 # H^4h
539     +
540     + # initialize ICB: GHASH( IV ), IV - r7
541     + lxvb16x 30+32, 0, 7 # load IV - v30
542     +
543     + mr 12, 5 # length
544     + li 11, 0 # block index
545     +
546     + # counter 1
547     + vxor 31, 31, 31
548     + vspltisb 22, 1
549     + vsldoi 31, 31, 22,1 # counter 1
550     +
551     + # load round key to VSR
552     + lxv 0, 0(6)
553     + lxv 1, 0x10(6)
554     + lxv 2, 0x20(6)
555     + lxv 3, 0x30(6)
556     + lxv 4, 0x40(6)
557     + lxv 5, 0x50(6)
558     + lxv 6, 0x60(6)
559     + lxv 7, 0x70(6)
560     + lxv 8, 0x80(6)
561     + lxv 9, 0x90(6)
562     + lxv 10, 0xa0(6)
563     +
564     + # load rounds - 10 (128), 12 (192), 14 (256)
565     + lwz 9,240(6)
566     +
567     + #
568     + # vxor state, state, w # addroundkey
569     + xxlor 32+29, 0, 0
570     + vxor 15, 30, 29 # IV + round key - add round key 0
571     +
572     + cmpdi 9, 10
573     + beq Loop_aes_gcm_8x
574     +
575     + # load 2 more round keys (v11, v12)
576     + lxv 11, 0xb0(6)
577     + lxv 12, 0xc0(6)
578     +
579     + cmpdi 9, 12
580     + beq Loop_aes_gcm_8x
581     +
582     + # load 2 more round keys (v13, v14)
583     + lxv 13, 0xd0(6)
584     + lxv 14, 0xe0(6)
585     + cmpdi 9, 14
586     + beq Loop_aes_gcm_8x
587     +
588     + b aes_gcm_out
589     +
590     +.align 5
591     +Loop_aes_gcm_8x:
592     + mr 14, 3
593     + mr 9, 4
594     +
595     + # n blocks
596     + li 10, 128
597     + divdu 10, 5, 10 # n 128 bytes-blocks
598     + cmpdi 10, 0
599     + beq Loop_last_block
600     +
601     + vaddudm 30, 30, 31 # IV + counter
602     + vxor 16, 30, 29
603     + vaddudm 30, 30, 31
604     + vxor 17, 30, 29
605     + vaddudm 30, 30, 31
606     + vxor 18, 30, 29
607     + vaddudm 30, 30, 31
608     + vxor 19, 30, 29
609     + vaddudm 30, 30, 31
610     + vxor 20, 30, 29
611     + vaddudm 30, 30, 31
612     + vxor 21, 30, 29
613     + vaddudm 30, 30, 31
614     + vxor 22, 30, 29
615     +
616     + mtctr 10
617     +
618     + li 15, 16
619     + li 16, 32
620     + li 17, 48
621     + li 18, 64
622     + li 19, 80
623     + li 20, 96
624     + li 21, 112
625     +
626     + lwz 10, 240(6)
627     +
628     +Loop_8x_block:
629     +
630     + lxvb16x 15, 0, 14 # load block
631     + lxvb16x 16, 15, 14 # load block
632     + lxvb16x 17, 16, 14 # load block
633     + lxvb16x 18, 17, 14 # load block
634     + lxvb16x 19, 18, 14 # load block
635     + lxvb16x 20, 19, 14 # load block
636     + lxvb16x 21, 20, 14 # load block
637     + lxvb16x 22, 21, 14 # load block
638     + addi 14, 14, 128
639     +
640     + Loop_aes_middle8x
641     +
642     + xxlor 23+32, 10, 10
643     +
644     + cmpdi 10, 10
645     + beq Do_next_ghash
646     +
647     + # 192 bits
648     + xxlor 24+32, 11, 11
649     +
650     + vcipher 15, 15, 23
651     + vcipher 16, 16, 23
652     + vcipher 17, 17, 23
653     + vcipher 18, 18, 23
654     + vcipher 19, 19, 23
655     + vcipher 20, 20, 23
656     + vcipher 21, 21, 23
657     + vcipher 22, 22, 23
658     +
659     + vcipher 15, 15, 24
660     + vcipher 16, 16, 24
661     + vcipher 17, 17, 24
662     + vcipher 18, 18, 24
663     + vcipher 19, 19, 24
664     + vcipher 20, 20, 24
665     + vcipher 21, 21, 24
666     + vcipher 22, 22, 24
667     +
668     + xxlor 23+32, 12, 12
669     +
670     + cmpdi 10, 12
671     + beq Do_next_ghash
672     +
673     + # 256 bits
674     + xxlor 24+32, 13, 13
675     +
676     + vcipher 15, 15, 23
677     + vcipher 16, 16, 23
678     + vcipher 17, 17, 23
679     + vcipher 18, 18, 23
680     + vcipher 19, 19, 23
681     + vcipher 20, 20, 23
682     + vcipher 21, 21, 23
683     + vcipher 22, 22, 23
684     +
685     + vcipher 15, 15, 24
686     + vcipher 16, 16, 24
687     + vcipher 17, 17, 24
688     + vcipher 18, 18, 24
689     + vcipher 19, 19, 24
690     + vcipher 20, 20, 24
691     + vcipher 21, 21, 24
692     + vcipher 22, 22, 24
693     +
694     + xxlor 23+32, 14, 14
695     +
696     + cmpdi 10, 14
697     + beq Do_next_ghash
698     + b aes_gcm_out
699     +
700     +Do_next_ghash:
701     +
702     + #
703     + # last round
704     + vcipherlast 15, 15, 23
705     + vcipherlast 16, 16, 23
706     +
707     + xxlxor 47, 47, 15
708     + stxvb16x 47, 0, 9 # store output
709     + xxlxor 48, 48, 16
710     + stxvb16x 48, 15, 9 # store output
711     +
712     + vcipherlast 17, 17, 23
713     + vcipherlast 18, 18, 23
714     +
715     + xxlxor 49, 49, 17
716     + stxvb16x 49, 16, 9 # store output
717     + xxlxor 50, 50, 18
718     + stxvb16x 50, 17, 9 # store output
719     +
720     + vcipherlast 19, 19, 23
721     + vcipherlast 20, 20, 23
722     +
723     + xxlxor 51, 51, 19
724     + stxvb16x 51, 18, 9 # store output
725     + xxlxor 52, 52, 20
726     + stxvb16x 52, 19, 9 # store output
727     +
728     + vcipherlast 21, 21, 23
729     + vcipherlast 22, 22, 23
730     +
731     + xxlxor 53, 53, 21
732     + stxvb16x 53, 20, 9 # store output
733     + xxlxor 54, 54, 22
734     + stxvb16x 54, 21, 9 # store output
735     +
736     + addi 9, 9, 128
737     +
738     + # ghash here
739     + ppc_aes_gcm_ghash2_4x
740     +
741     + xxlor 27+32, 0, 0
742     + vaddudm 30, 30, 31 # IV + counter
743     + vmr 29, 30
744     + vxor 15, 30, 27 # add round key
745     + vaddudm 30, 30, 31
746     + vxor 16, 30, 27
747     + vaddudm 30, 30, 31
748     + vxor 17, 30, 27
749     + vaddudm 30, 30, 31
750     + vxor 18, 30, 27
751     + vaddudm 30, 30, 31
752     + vxor 19, 30, 27
753     + vaddudm 30, 30, 31
754     + vxor 20, 30, 27
755     + vaddudm 30, 30, 31
756     + vxor 21, 30, 27
757     + vaddudm 30, 30, 31
758     + vxor 22, 30, 27
759     +
760     + addi 12, 12, -128
761     + addi 11, 11, 128
762     +
763     + bdnz Loop_8x_block
764     +
765     + vmr 30, 29
766     +
767     +Loop_last_block:
768     + cmpdi 12, 0
769     + beq aes_gcm_out
770     +
771     + # loop last few blocks
772     + li 10, 16
773     + divdu 10, 12, 10
774     +
775     + mtctr 10
776     +
777     + lwz 10, 240(6)
778     +
779     + cmpdi 12, 16
780     + blt Final_block
781     +
782     +.macro Loop_aes_middle_1x
783     + xxlor 19+32, 1, 1
784     + xxlor 20+32, 2, 2
785     + xxlor 21+32, 3, 3
786     + xxlor 22+32, 4, 4
787     +
788     + vcipher 15, 15, 19
789     + vcipher 15, 15, 20
790     + vcipher 15, 15, 21
791     + vcipher 15, 15, 22
792     +
793     + xxlor 19+32, 5, 5
794     + xxlor 20+32, 6, 6
795     + xxlor 21+32, 7, 7
796     + xxlor 22+32, 8, 8
797     +
798     + vcipher 15, 15, 19
799     + vcipher 15, 15, 20
800     + vcipher 15, 15, 21
801     + vcipher 15, 15, 22
802     +
803     + xxlor 19+32, 9, 9
804     + vcipher 15, 15, 19
805     +.endm
806     +
807     +Next_rem_block:
808     + lxvb16x 15, 0, 14 # load block
809     +
810     + Loop_aes_middle_1x
811     +
812     + xxlor 23+32, 10, 10
813     +
814     + cmpdi 10, 10
815     + beq Do_next_1x
816     +
817     + # 192 bits
818     + xxlor 24+32, 11, 11
819     +
820     + vcipher 15, 15, 23
821     + vcipher 15, 15, 24
822     +
823     + xxlor 23+32, 12, 12
824     +
825     + cmpdi 10, 12
826     + beq Do_next_1x
827     +
828     + # 256 bits
829     + xxlor 24+32, 13, 13
830     +
831     + vcipher 15, 15, 23
832     + vcipher 15, 15, 24
833     +
834     + xxlor 23+32, 14, 14
835     +
836     + cmpdi 10, 14
837     + beq Do_next_1x
838     +
839     +Do_next_1x:
840     + vcipherlast 15, 15, 23
841     +
842     + xxlxor 47, 47, 15
843     + stxvb16x 47, 0, 9 # store output
844     + addi 14, 14, 16
845     + addi 9, 9, 16
846     +
847     + vmr 28, 15
848     + ppc_update_hash_1x
849     +
850     + addi 12, 12, -16
851     + addi 11, 11, 16
852     + xxlor 19+32, 0, 0
853     + vaddudm 30, 30, 31 # IV + counter
854     + vxor 15, 30, 19 # add round key
855     +
856     + bdnz Next_rem_block
857     +
858     + cmpdi 12, 0
859     + beq aes_gcm_out
860     +
861     +Final_block:
862     + Loop_aes_middle_1x
863     +
864     + xxlor 23+32, 10, 10
865     +
866     + cmpdi 10, 10
867     + beq Do_final_1x
868     +
869     + # 192 bits
870     + xxlor 24+32, 11, 11
871     +
872     + vcipher 15, 15, 23
873     + vcipher 15, 15, 24
874     +
875     + xxlor 23+32, 12, 12
876     +
877     + cmpdi 10, 12
878     + beq Do_final_1x
879     +
880     + # 256 bits
881     + xxlor 24+32, 13, 13
882     +
883     + vcipher 15, 15, 23
884     + vcipher 15, 15, 24
885     +
886     + xxlor 23+32, 14, 14
887     +
888     + cmpdi 10, 14
889     + beq Do_final_1x
890     +
891     +Do_final_1x:
892     + vcipherlast 15, 15, 23
893     +
894     + lxvb16x 15, 0, 14 # load last block
895     + xxlxor 47, 47, 15
896     +
897     + # create partial block mask
898     + li 15, 16
899     + sub 15, 15, 12 # index to the mask
900     +
901     + vspltisb 16, -1 # first 16 bytes - 0xffff...ff
902     + vspltisb 17, 0 # second 16 bytes - 0x0000...00
903     + li 10, 192
904     + stvx 16, 10, 1
905     + addi 10, 10, 16
906     + stvx 17, 10, 1
907     +
908     + addi 10, 1, 192
909     + lxvb16x 16, 15, 10 # load partial block mask
910     + xxland 47, 47, 16
911     +
912     + vmr 28, 15
913     + ppc_update_hash_1x
914     +
915     + # * should store only the remaining bytes.
916     + bl Write_partial_block
917     +
918     + b aes_gcm_out
919     +
920     +#
921     +# Write partial block
922     +# r9 - output
923     +# r12 - remaining bytes
924     +# v15 - partial input data
925     +#
926     +Write_partial_block:
927     + li 10, 192
928     + stxvb16x 15+32, 10, 1 # last block
929     +
930     + #add 10, 9, 11 # Output
931     + addi 10, 9, -1
932     + addi 16, 1, 191
933     +
934     + mtctr 12 # remaining bytes
935     + li 15, 0
936     +
937     +Write_last_byte:
938     + lbzu 14, 1(16)
939     + stbu 14, 1(10)
940     + bdnz Write_last_byte
941     + blr
942     +
943     +aes_gcm_out:
944     + # out = state
945     + stxvb16x 32, 0, 8 # write out Xi
946     + add 3, 11, 12 # return count
947     +
948     + li 9, 256
949     + lvx 20, 9, 1
950     + addi 9, 9, 16
951     + lvx 21, 9, 1
952     + addi 9, 9, 16
953     + lvx 22, 9, 1
954     + addi 9, 9, 16
955     + lvx 23, 9, 1
956     + addi 9, 9, 16
957     + lvx 24, 9, 1
958     + addi 9, 9, 16
959     + lvx 25, 9, 1
960     + addi 9, 9, 16
961     + lvx 26, 9, 1
962     + addi 9, 9, 16
963     + lvx 27, 9, 1
964     + addi 9, 9, 16
965     + lvx 28, 9, 1
966     + addi 9, 9, 16
967     + lvx 29, 9, 1
968     + addi 9, 9, 16
969     + lvx 30, 9, 1
970     + addi 9, 9, 16
971     + lvx 31, 9, 1
972     +
973     + ld 0, 528(1)
974     + ld 14,112(1)
975     + ld 15,120(1)
976     + ld 16,128(1)
977     + ld 17,136(1)
978     + ld 18,144(1)
979     + ld 19,152(1)
980     + ld 20,160(1)
981     + ld 21,168(1)
982     +
983     + mtlr 0
984     + addi 1, 1, 512
985     + blr
986     +
987     +#
988     +# 8x Decrypt
989     +#
990     +.global ppc_aes_gcm_decrypt
991     +.align 5
992     +ppc_aes_gcm_decrypt:
993     +_ppc_aes_gcm_decrypt:
994     +
995     + stdu 1,-512(1)
996     + mflr 0
997     +
998     + std 14,112(1)
999     + std 15,120(1)
1000     + std 16,128(1)
1001     + std 17,136(1)
1002     + std 18,144(1)
1003     + std 19,152(1)
1004     + std 20,160(1)
1005     + std 21,168(1)
1006     + li 9, 256
1007     + stvx 20, 9, 1
1008     + addi 9, 9, 16
1009     + stvx 21, 9, 1
1010     + addi 9, 9, 16
1011     + stvx 22, 9, 1
1012     + addi 9, 9, 16
1013     + stvx 23, 9, 1
1014     + addi 9, 9, 16
1015     + stvx 24, 9, 1
1016     + addi 9, 9, 16
1017     + stvx 25, 9, 1
1018     + addi 9, 9, 16
1019     + stvx 26, 9, 1
1020     + addi 9, 9, 16
1021     + stvx 27, 9, 1
1022     + addi 9, 9, 16
1023     + stvx 28, 9, 1
1024     + addi 9, 9, 16
1025     + stvx 29, 9, 1
1026     + addi 9, 9, 16
1027     + stvx 30, 9, 1
1028     + addi 9, 9, 16
1029     + stvx 31, 9, 1
1030     + std 0, 528(1)
1031     +
1032     + # Load Xi
1033     + lxvb16x 32, 0, 8 # load Xi
1034     +
1035     + # load Hash - h^4, h^3, h^2, h
1036     + li 10, 32
1037     + lxvd2x 2+32, 10, 8 # H Poly
1038     + li 10, 48
1039     + lxvd2x 3+32, 10, 8 # Hl
1040     + li 10, 64
1041     + lxvd2x 4+32, 10, 8 # H
1042     + li 10, 80
1043     + lxvd2x 5+32, 10, 8 # Hh
1044     +
1045     + li 10, 96
1046     + lxvd2x 6+32, 10, 8 # H^2l
1047     + li 10, 112
1048     + lxvd2x 7+32, 10, 8 # H^2
1049     + li 10, 128
1050     + lxvd2x 8+32, 10, 8 # H^2h
1051     +
1052     + li 10, 144
1053     + lxvd2x 9+32, 10, 8 # H^3l
1054     + li 10, 160
1055     + lxvd2x 10+32, 10, 8 # H^3
1056     + li 10, 176
1057     + lxvd2x 11+32, 10, 8 # H^3h
1058     +
1059     + li 10, 192
1060     + lxvd2x 12+32, 10, 8 # H^4l
1061     + li 10, 208
1062     + lxvd2x 13+32, 10, 8 # H^4
1063     + li 10, 224
1064     + lxvd2x 14+32, 10, 8 # H^4h
1065     +
1066     + # initialize ICB: GHASH( IV ), IV - r7
1067     + lxvb16x 30+32, 0, 7 # load IV - v30
1068     +
1069     + mr 12, 5 # length
1070     + li 11, 0 # block index
1071     +
1072     + # counter 1
1073     + vxor 31, 31, 31
1074     + vspltisb 22, 1
1075     + vsldoi 31, 31, 22,1 # counter 1
1076     +
1077     + # load round key to VSR
1078     + lxv 0, 0(6)
1079     + lxv 1, 0x10(6)
1080     + lxv 2, 0x20(6)
1081     + lxv 3, 0x30(6)
1082     + lxv 4, 0x40(6)
1083     + lxv 5, 0x50(6)
1084     + lxv 6, 0x60(6)
1085     + lxv 7, 0x70(6)
1086     + lxv 8, 0x80(6)
1087     + lxv 9, 0x90(6)
1088     + lxv 10, 0xa0(6)
1089     +
1090     + # load rounds - 10 (128), 12 (192), 14 (256)
1091     + lwz 9,240(6)
1092     +
1093     + #
1094     + # vxor state, state, w # addroundkey
1095     + xxlor 32+29, 0, 0
1096     + vxor 15, 30, 29 # IV + round key - add round key 0
1097     +
1098     + cmpdi 9, 10
1099     + beq Loop_aes_gcm_8x_dec
1100     +
1101     + # load 2 more round keys (v11, v12)
1102     + lxv 11, 0xb0(6)
1103     + lxv 12, 0xc0(6)
1104     +
1105     + cmpdi 9, 12
1106     + beq Loop_aes_gcm_8x_dec
1107     +
1108     + # load 2 more round keys (v13, v14)
1109     + lxv 13, 0xd0(6)
1110     + lxv 14, 0xe0(6)
1111     + cmpdi 9, 14
1112     + beq Loop_aes_gcm_8x_dec
1113     +
1114     + b aes_gcm_out
1115     +
1116     +.align 5
1117     +Loop_aes_gcm_8x_dec:
1118     + mr 14, 3
1119     + mr 9, 4
1120     +
1121     + # n blocks
1122     + li 10, 128
1123     + divdu 10, 5, 10 # n 128 bytes-blocks
1124     + cmpdi 10, 0
1125     + beq Loop_last_block_dec
1126     +
1127     + vaddudm 30, 30, 31 # IV + counter
1128     + vxor 16, 30, 29
1129     + vaddudm 30, 30, 31
1130     + vxor 17, 30, 29
1131     + vaddudm 30, 30, 31
1132     + vxor 18, 30, 29
1133     + vaddudm 30, 30, 31
1134     + vxor 19, 30, 29
1135     + vaddudm 30, 30, 31
1136     + vxor 20, 30, 29
1137     + vaddudm 30, 30, 31
1138     + vxor 21, 30, 29
1139     + vaddudm 30, 30, 31
1140     + vxor 22, 30, 29
1141     +
1142     + mtctr 10
1143     +
1144     + li 15, 16
1145     + li 16, 32
1146     + li 17, 48
1147     + li 18, 64
1148     + li 19, 80
1149     + li 20, 96
1150     + li 21, 112
1151     +
1152     + lwz 10, 240(6)
1153     +
1154     +Loop_8x_block_dec:
1155     +
1156     + lxvb16x 15, 0, 14 # load block
1157     + lxvb16x 16, 15, 14 # load block
1158     + lxvb16x 17, 16, 14 # load block
1159     + lxvb16x 18, 17, 14 # load block
1160     + lxvb16x 19, 18, 14 # load block
1161     + lxvb16x 20, 19, 14 # load block
1162     + lxvb16x 21, 20, 14 # load block
1163     + lxvb16x 22, 21, 14 # load block
1164     + addi 14, 14, 128
1165     +
1166     + Loop_aes_middle8x
1167     +
1168     + xxlor 23+32, 10, 10
1169     +
1170     + cmpdi 10, 10
1171     + beq Do_last_aes_dec
1172     +
1173     + # 192 bits
1174     + xxlor 24+32, 11, 11
1175     +
1176     + vcipher 15, 15, 23
1177     + vcipher 16, 16, 23
1178     + vcipher 17, 17, 23
1179     + vcipher 18, 18, 23
1180     + vcipher 19, 19, 23
1181     + vcipher 20, 20, 23
1182     + vcipher 21, 21, 23
1183     + vcipher 22, 22, 23
1184     +
1185     + vcipher 15, 15, 24
1186     + vcipher 16, 16, 24
1187     + vcipher 17, 17, 24
1188     + vcipher 18, 18, 24
1189     + vcipher 19, 19, 24
1190     + vcipher 20, 20, 24
1191     + vcipher 21, 21, 24
1192     + vcipher 22, 22, 24
1193     +
1194     + xxlor 23+32, 12, 12
1195     +
1196     + cmpdi 10, 12
1197     + beq Do_last_aes_dec
1198     +
1199     + # 256 bits
1200     + xxlor 24+32, 13, 13
1201     +
1202     + vcipher 15, 15, 23
1203     + vcipher 16, 16, 23
1204     + vcipher 17, 17, 23
1205     + vcipher 18, 18, 23
1206     + vcipher 19, 19, 23
1207     + vcipher 20, 20, 23
1208     + vcipher 21, 21, 23
1209     + vcipher 22, 22, 23
1210     +
1211     + vcipher 15, 15, 24
1212     + vcipher 16, 16, 24
1213     + vcipher 17, 17, 24
1214     + vcipher 18, 18, 24
1215     + vcipher 19, 19, 24
1216     + vcipher 20, 20, 24
1217     + vcipher 21, 21, 24
1218     + vcipher 22, 22, 24
1219     +
1220     + xxlor 23+32, 14, 14
1221     +
1222     + cmpdi 10, 14
1223     + beq Do_last_aes_dec
1224     + b aes_gcm_out
1225     +
1226     +Do_last_aes_dec:
1227     +
1228     + #
1229     + # last round
1230     + vcipherlast 15, 15, 23
1231     + vcipherlast 16, 16, 23
1232     +
1233     + xxlxor 47, 47, 15
1234     + stxvb16x 47, 0, 9 # store output
1235     + xxlxor 48, 48, 16
1236     + stxvb16x 48, 15, 9 # store output
1237     +
1238     + vcipherlast 17, 17, 23
1239     + vcipherlast 18, 18, 23
1240     +
1241     + xxlxor 49, 49, 17
1242     + stxvb16x 49, 16, 9 # store output
1243     + xxlxor 50, 50, 18
1244     + stxvb16x 50, 17, 9 # store output
1245     +
1246     + vcipherlast 19, 19, 23
1247     + vcipherlast 20, 20, 23
1248     +
1249     + xxlxor 51, 51, 19
1250     + stxvb16x 51, 18, 9 # store output
1251     + xxlxor 52, 52, 20
1252     + stxvb16x 52, 19, 9 # store output
1253     +
1254     + vcipherlast 21, 21, 23
1255     + vcipherlast 22, 22, 23
1256     +
1257     + xxlxor 53, 53, 21
1258     + stxvb16x 53, 20, 9 # store output
1259     + xxlxor 54, 54, 22
1260     + stxvb16x 54, 21, 9 # store output
1261     +
1262     + addi 9, 9, 128
1263     +
1264     + xxlor 15+32, 15, 15
1265     + xxlor 16+32, 16, 16
1266     + xxlor 17+32, 17, 17
1267     + xxlor 18+32, 18, 18
1268     + xxlor 19+32, 19, 19
1269     + xxlor 20+32, 20, 20
1270     + xxlor 21+32, 21, 21
1271     + xxlor 22+32, 22, 22
1272     +
1273     + # ghash here
1274     + ppc_aes_gcm_ghash2_4x
1275     +
1276     + xxlor 27+32, 0, 0
1277     + vaddudm 30, 30, 31 # IV + counter
1278     + vmr 29, 30
1279     + vxor 15, 30, 27 # add round key
1280     + vaddudm 30, 30, 31
1281     + vxor 16, 30, 27
1282     + vaddudm 30, 30, 31
1283     + vxor 17, 30, 27
1284     + vaddudm 30, 30, 31
1285     + vxor 18, 30, 27
1286     + vaddudm 30, 30, 31
1287     + vxor 19, 30, 27
1288     + vaddudm 30, 30, 31
1289     + vxor 20, 30, 27
1290     + vaddudm 30, 30, 31
1291     + vxor 21, 30, 27
1292     + vaddudm 30, 30, 31
1293     + vxor 22, 30, 27
1294     + addi 12, 12, -128
1295     + addi 11, 11, 128
1296     +
1297     + bdnz Loop_8x_block_dec
1298     +
1299     + vmr 30, 29
1300     +
1301     +Loop_last_block_dec:
1302     + cmpdi 12, 0
1303     + beq aes_gcm_out
1304     +
1305     + # loop last few blocks
1306     + li 10, 16
1307     + divdu 10, 12, 10
1308     +
1309     + mtctr 10
1310     +
1311     + lwz 10,240(6)
1312     +
1313     + cmpdi 12, 16
1314     + blt Final_block_dec
1315     +
1316     +Next_rem_block_dec:
1317     + lxvb16x 15, 0, 14 # load block
1318     +
1319     + Loop_aes_middle_1x
1320     +
1321     + xxlor 23+32, 10, 10
1322     +
1323     + cmpdi 10, 10
1324     + beq Do_next_1x_dec
1325     +
1326     + # 192 bits
1327     + xxlor 24+32, 11, 11
1328     +
1329     + vcipher 15, 15, 23
1330     + vcipher 15, 15, 24
1331     +
1332     + xxlor 23+32, 12, 12
1333     +
1334     + cmpdi 10, 12
1335     + beq Do_next_1x_dec
1336     +
1337     + # 256 bits
1338     + xxlor 24+32, 13, 13
1339     +
1340     + vcipher 15, 15, 23
1341     + vcipher 15, 15, 24
1342     +
1343     + xxlor 23+32, 14, 14
1344     +
1345     + cmpdi 10, 14
1346     + beq Do_next_1x_dec
1347     +
1348     +Do_next_1x_dec:
1349     + vcipherlast 15, 15, 23
1350     +
1351     + xxlxor 47, 47, 15
1352     + stxvb16x 47, 0, 9 # store output
1353     + addi 14, 14, 16
1354     + addi 9, 9, 16
1355     +
1356     + xxlor 28+32, 15, 15
1357     + ppc_update_hash_1x
1358     +
1359     + addi 12, 12, -16
1360     + addi 11, 11, 16
1361     + xxlor 19+32, 0, 0
1362     + vaddudm 30, 30, 31 # IV + counter
1363     + vxor 15, 30, 19 # add round key
1364     +
1365     + bdnz Next_rem_block_dec
1366     +
1367     + cmpdi 12, 0
1368     + beq aes_gcm_out
1369     +
1370     +Final_block_dec:
1371     + Loop_aes_middle_1x
1372     +
1373     + xxlor 23+32, 10, 10
1374     +
1375     + cmpdi 10, 10
1376     + beq Do_final_1x_dec
1377     +
1378     + # 192 bits
1379     + xxlor 24+32, 11, 11
1380     +
1381     + vcipher 15, 15, 23
1382     + vcipher 15, 15, 24
1383     +
1384     + xxlor 23+32, 12, 12
1385     +
1386     + cmpdi 10, 12
1387     + beq Do_final_1x_dec
1388     +
1389     + # 256 bits
1390     + xxlor 24+32, 13, 13
1391     +
1392     + vcipher 15, 15, 23
1393     + vcipher 15, 15, 24
1394     +
1395     + xxlor 23+32, 14, 14
1396     +
1397     + cmpdi 10, 14
1398     + beq Do_final_1x_dec
1399     +
1400     +Do_final_1x_dec:
1401     + vcipherlast 15, 15, 23
1402     +
1403     + lxvb16x 15, 0, 14 # load block
1404     + xxlxor 47, 47, 15
1405     +
1406     + # create partial block mask
1407     + li 15, 16
1408     + sub 15, 15, 12 # index to the mask
1409     +
1410     + vspltisb 16, -1 # first 16 bytes - 0xffff...ff
1411     + vspltisb 17, 0 # second 16 bytes - 0x0000...00
1412     + li 10, 192
1413     + stvx 16, 10, 1
1414     + addi 10, 10, 16
1415     + stvx 17, 10, 1
1416     +
1417     + addi 10, 1, 192
1418     + lxvb16x 16, 15, 10 # load block mask
1419     + xxland 47, 47, 16
1420     +
1421     + xxlor 28+32, 15, 15
1422     + ppc_update_hash_1x
1423     +
1424     + # * should store only the remaining bytes.
1425     + bl Write_partial_block
1426     +
1427     + b aes_gcm_out
1428     +
1429     +
1430     +___
1431     +
1432     +foreach (split("\n",$code)) {
1433     + s/\`([^\`]*)\`/eval $1/geo;
1434     +
1435     + if ($flavour =~ /le$/o) { # little-endian
1436     + s/le\?//o or
1437     + s/be\?/#be#/o;
1438     + } else {
1439     + s/le\?/#le#/o or
1440     + s/be\?//o;
1441     + }
1442     + print $_,"\n";
1443     +}
1444     +
1445     +close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1446     diff --git a/crypto/modes/build.info b/crypto/modes/build.info
1447     index 687e872..0ea122e 100644
1448     --- a/crypto/modes/build.info
1449     +++ b/crypto/modes/build.info
1450     @@ -32,7 +32,7 @@ IF[{- !$disabled{asm} -}]
1451     $MODESASM_parisc20_64=$MODESASM_parisc11
1452     $MODESDEF_parisc20_64=$MODESDEF_parisc11
1453    
1454     - $MODESASM_ppc32=ghashp8-ppc.s
1455     + $MODESASM_ppc32=ghashp8-ppc.s aes-gcm-ppc.s
1456     $MODESDEF_ppc32=
1457     $MODESASM_ppc64=$MODESASM_ppc32
1458     $MODESDEF_ppc64=$MODESDEF_ppc32
1459     @@ -71,6 +71,7 @@ INCLUDE[ghash-sparcv9.o]=..
1460     GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
1461     GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
1462     GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
1463     +GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
1464     GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
1465     INCLUDE[ghash-armv4.o]=..
1466     GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
1467     diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
1468     index e95ad5a..0c281a3 100644
1469     --- a/include/crypto/aes_platform.h
1470     +++ b/include/crypto/aes_platform.h
1471     @@ -74,6 +74,26 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
1472     # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
1473     # define HWAES_xts_encrypt aes_p8_xts_encrypt
1474     # define HWAES_xts_decrypt aes_p8_xts_decrypt
1475     +# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300)
1476     +# define AES_GCM_ENC_BYTES 128
1477     +# define AES_GCM_DEC_BYTES 128
1478     +size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
1479     + size_t len, const void *key, unsigned char ivec[16],
1480     + u64 *Xi);
1481     +size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
1482     + size_t len, const void *key, unsigned char ivec[16],
1483     + u64 *Xi);
1484     +size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out,
1485     + size_t len, const void *key,
1486     + unsigned char ivec[16], u64 *Xi);
1487     +size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out,
1488     + size_t len, const void *key,
1489     + unsigned char ivec[16], u64 *Xi);
1490     +# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap
1491     +# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap
1492     +# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \
1493     + (gctx)->gcm.ghash==gcm_ghash_p8)
1494     +void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
1495     # endif /* PPC */
1496    
1497     # if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
1498     diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1499     index 44fa9d4..789ec12 100644
1500     --- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1501     +++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
1502     @@ -141,6 +141,8 @@ static const PROV_GCM_HW aes_gcm = {
1503     # include "cipher_aes_gcm_hw_t4.inc"
1504     #elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM)
1505     # include "cipher_aes_gcm_hw_armv8.inc"
1506     +#elif defined(PPC_AES_GCM_CAPABLE)
1507     +# include "cipher_aes_gcm_hw_ppc.inc"
1508     #else
1509     const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
1510     {
1511     diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
1512     new file mode 100644
1513     index 0000000..4eed0f4
1514     --- /dev/null
1515     +++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
1516     @@ -0,0 +1,119 @@
1517     +/*
1518     + * Copyright 2001-2021 The OpenSSL Project Authors. All Rights Reserved.
1519     + *
1520     + * Licensed under the Apache License 2.0 (the "License"). You may not use
1521     + * this file except in compliance with the License. You can obtain a copy
1522     + * in the file LICENSE in the source distribution or at
1523     + * https://www.openssl.org/source/license.html
1524     + */
1525     +
1526     +/*-
1527     + * PPC support for AES GCM.
1528     + * This file is included by cipher_aes_gcm_hw.c
1529     + */
1530     +
1531     +static int aes_ppc_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
1532     + size_t keylen)
1533     +{
1534     + PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx;
1535     + AES_KEY *ks = &actx->ks.ks;
1536     +
1537     + GCM_HW_SET_KEY_CTR_FN(ks, aes_p8_set_encrypt_key, aes_p8_encrypt,
1538     + aes_p8_ctr32_encrypt_blocks);
1539     + return 1;
1540     +}
1541     +
1542     +
1543     +extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
1544     + const void *key, unsigned char ivec[16], u64 *Xi);
1545     +extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
1546     + const void *key, unsigned char ivec[16], u64 *Xi);
1547     +
1548     +static inline u32 UTO32(unsigned char *buf)
1549     +{
1550     + return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]);
1551     +}
1552     +
1553     +static inline u32 add32TOU(unsigned char buf[4], u32 n)
1554     +{
1555     + u32 r;
1556     +
1557     + r = UTO32(buf);
1558     + r += n;
1559     + buf[0] = (unsigned char) (r >> 24) & 0xFF;
1560     + buf[1] = (unsigned char) (r >> 16) & 0xFF;
1561     + buf[2] = (unsigned char) (r >> 8) & 0xFF;
1562     + buf[3] = (unsigned char) r & 0xFF;
1563     + return r;
1564     +}
1565     +
1566     +static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len,
1567     + const void *key, unsigned char ivec[16], u64 *Xi, int encrypt)
1568     +{
1569     + int s = 0;
1570     + int ndone = 0;
1571     + int ctr_reset = 0;
1572     + u64 blocks_unused;
1573     + u64 nb = len / 16;
1574     + u64 next_ctr = 0;
1575     + unsigned char ctr_saved[12];
1576     +
1577     + memcpy(ctr_saved, ivec, 12);
1578     +
1579     + while (nb) {
1580     + blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12);
1581     + if (nb > blocks_unused) {
1582     + len = blocks_unused * 16;
1583     + nb -= blocks_unused;
1584     + next_ctr = blocks_unused;
1585     + ctr_reset = 1;
1586     + } else {
1587     + len = nb * 16;
1588     + next_ctr = nb;
1589     + nb = 0;
1590     + }
1591     +
1592     + s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi)
1593     + : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi);
1594     +
1595     + /* add counter to ivec */
1596     + add32TOU(ivec + 12, (u32) next_ctr);
1597     + if (ctr_reset) {
1598     + ctr_reset = 0;
1599     + in += len;
1600     + out += len;
1601     + }
1602     + memcpy(ivec, ctr_saved, 12);
1603     + ndone += s;
1604     + }
1605     +
1606     + return ndone;
1607     +}
1608     +
1609     +size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
1610     + const void *key, unsigned char ivec[16], u64 *Xi)
1611     +{
1612     + return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1);
1613     +}
1614     +
1615     +size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
1616     + const void *key, unsigned char ivec[16], u64 *Xi)
1617     +{
1618     + return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0);
1619     +}
1620     +
1621     +
1622     +static const PROV_GCM_HW aes_ppc_gcm = {
1623     + aes_ppc_gcm_initkey,
1624     + ossl_gcm_setiv,
1625     + ossl_gcm_aad_update,
1626     + generic_aes_gcm_cipher_update,
1627     + ossl_gcm_cipher_final,
1628     + ossl_gcm_one_shot
1629     +};
1630     +
1631     +const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
1632     +{
1633     + return PPC_AES_GCM_CAPABLE ? &aes_ppc_gcm : &aes_gcm;
1634     +}
1635     +
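The aes_p10_gcm_crypt wrapper in cipher_aes_gcm_hw_ppc.inc above splits each request at the 32-bit counter boundary, because the assembly core only increments the low 32 bits of the counter block. A hedged standalone sketch of that boundary computation; be32_load and blocks_before_wrap are illustrative names, not OpenSSL APIs:

#include <stdint.h>
#include <stdio.h>

/* big-endian load of the low 32-bit GCM counter, as UTO32() does above */
static uint32_t be32_load(const unsigned char b[4])
{
    return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16)
         | ((uint32_t)b[2] << 8) | (uint32_t)b[3];
}

/* 16-byte blocks that fit before the 32-bit counter wraps to zero */
static uint64_t blocks_before_wrap(const unsigned char ivec[16])
{
    return (uint64_t)0xffffffffU + 1 - be32_load(ivec + 12);
}

int main(void)
{
    unsigned char ivec[16] = { 0 };
    uint64_t total_blocks = 8, chunk;

    /* counter three short of wrapping: only 3 blocks fit in the first call */
    ivec[12] = 0xff; ivec[13] = 0xff; ivec[14] = 0xff; ivec[15] = 0xfd;

    chunk = blocks_before_wrap(ivec);
    if (chunk > total_blocks)
        chunk = total_blocks;
    printf("first chunk: %llu blocks, remaining: %llu\n",
           (unsigned long long)chunk,
           (unsigned long long)(total_blocks - chunk));
    return 0;
}

In the patch itself the same quantity is computed as 0xffffffffU + 1 - UTO32(ivec + 12); the wrapper then advances the counter bytes with add32TOU, restores the first 12 IV bytes from ctr_saved, and loops until all blocks are processed.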
