/[smecontribs]/rpms/openssl3/contribs10/0067-ppc64le-Montgomery-multiply.patch
ViewVC logotype

Contents of /rpms/openssl3/contribs10/0067-ppc64le-Montgomery-multiply.patch

Parent Directory | Revision Log | View Revision Graph


Revision 1.1 - (show annotations) (download)
Wed Jan 31 17:24:43 2024 UTC (9 months, 3 weeks ago) by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001
2 From: Rohan McLure <rohanmclure@linux.ibm.com>
3 Date: Mon, 27 Jun 2022 12:14:55 +1000
4 Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC
5 Montgomery Multiplication""
6
7 This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.
8 ---
9 crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++
10 crypto/bn/bn_ppc.c | 15 +
11 crypto/bn/build.info | 3 +-
12 3 files changed, 598 insertions(+), 1 deletion(-)
13
14 diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
15 index e69de29bb2d1..0fb397bc5f12 100755
16 --- a/crypto/bn/asm/ppc64-mont-fixed.pl
17 +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
18 @@ -0,0 +1,581 @@
19 +#! /usr/bin/env perl
20 +# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
21 +#
22 +# Licensed under the Apache License 2.0 (the "License"). You may not use
23 +# this file except in compliance with the License. You can obtain a copy
24 +# in the file LICENSE in the source distribution or at
25 +# https://www.openssl.org/source/license.html
26 +
27 +# ====================================================================
28 +# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
29 +# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
30 +# the OpenSSL project.
31 +# ====================================================================
32 +
33 +#
34 +# Fixed length (n=6), unrolled PPC Montgomery Multiplication
35 +#
36 +
37 +# 2021
38 +#
39 +# Although this is a generic implementation for unrolling Montgomery
40 +# Multiplication for arbitrary values of n, this is currently only
41 +# used for n = 6 to improve the performance of ECC p384.
42 +#
43 +# Unrolling allows intermediate results to be stored in registers,
44 +# rather than on the stack, improving performance by ~7% compared to
45 +# the existing PPC assembly code.
46 +#
47 +# The ISA 3.0 implementation uses combination multiply/add
48 +# instructions (maddld, maddhdu) to improve performance by an
49 +# additional ~10% on Power 9.
50 +#
51 +# Finally, saving non-volatile registers into volatile vector
52 +# registers instead of onto the stack saves a little more.
53 +#
54 +# On a Power 9 machine we see an overall improvement of ~18%.
55 +#
56 +
57 +use strict;
58 +use warnings;
59 +
60 +my ($flavour, $output, $dir, $xlate);
61 +
62 +# $output is the last argument if it looks like a file (it has an extension)
63 +# $flavour is the first argument if it doesn't look like a file
64 +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
65 +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
66 +
67 +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68 +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
69 +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
70 +die "can't locate ppc-xlate.pl";
71 +
72 +open STDOUT,"| $^X $xlate $flavour \"$output\""
73 + or die "can't call $xlate: $!";
74 +
75 +if ($flavour !~ /64/) {
76 + die "bad flavour ($flavour) - only ppc64 permitted";
77 +}
78 +
79 +my $SIZE_T= 8;
80 +
81 +# Registers are global so the code is remotely readable
82 +
83 +# Parameters for Montgomery multiplication
84 +my $sp = "r1";
85 +my $toc = "r2";
86 +my $rp = "r3";
87 +my $ap = "r4";
88 +my $bp = "r5";
89 +my $np = "r6";
90 +my $n0 = "r7";
91 +my $num = "r8";
92 +
93 +my $i = "r9";
94 +my $c0 = "r10";
95 +my $bp0 = "r11";
96 +my $bpi = "r11";
97 +my $bpj = "r11";
98 +my $tj = "r12";
99 +my $apj = "r12";
100 +my $npj = "r12";
101 +my $lo = "r14";
102 +my $c1 = "r14";
103 +
104 +# Non-volatile registers used for tp[i]
105 +#
106 +# 12 registers are available but the limit on unrolling is 10,
107 +# since registers from $tp[0] to $tp[$n+1] are used.
108 +my @tp = ("r20" .. "r31");
109 +
110 +# volatile VSRs for saving non-volatile GPRs - faster than stack
111 +my @vsrs = ("v32" .. "v46");
112 +
113 +package Mont;
114 +
115 +sub new($$)
116 +{
117 + my ($class, $n) = @_;
118 +
119 + if ($n > 10) {
120 + die "Can't unroll for BN length ${n} (maximum 10)"
121 + }
122 +
123 + my $self = {
124 + code => "",
125 + n => $n,
126 + };
127 + bless $self, $class;
128 +
129 + return $self;
130 +}
131 +
132 +sub add_code($$)
133 +{
134 + my ($self, $c) = @_;
135 +
136 + $self->{code} .= $c;
137 +}
138 +
139 +sub get_code($)
140 +{
141 + my ($self) = @_;
142 +
143 + return $self->{code};
144 +}
145 +
146 +sub get_function_name($)
147 +{
148 + my ($self) = @_;
149 +
150 + return "bn_mul_mont_fixed_n" . $self->{n};
151 +}
152 +
153 +sub get_label($$)
154 +{
155 + my ($self, $l) = @_;
156 +
157 + return "L" . $l . "_" . $self->{n};
158 +}
159 +
160 +sub get_labels($@)
161 +{
162 + my ($self, @labels) = @_;
163 +
164 + my %out = ();
165 +
166 + foreach my $l (@labels) {
167 + $out{"$l"} = $self->get_label("$l");
168 + }
169 +
170 + return \%out;
171 +}
172 +
173 +sub nl($)
174 +{
175 + my ($self) = @_;
176 +
177 + $self->add_code("\n");
178 +}
179 +
180 +sub copy_result($)
181 +{
182 + my ($self) = @_;
183 +
184 + my ($n) = $self->{n};
185 +
186 + for (my $j = 0; $j < $n; $j++) {
187 + $self->add_code(<<___);
188 + std $tp[$j],`$j*$SIZE_T`($rp)
189 +___
190 + }
191 +
192 +}
193 +
194 +sub mul_mont_fixed($)
195 +{
196 + my ($self) = @_;
197 +
198 + my ($n) = $self->{n};
199 + my $fname = $self->get_function_name();
200 + my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
201 +
202 + $self->add_code(<<___);
203 +
204 +.globl .${fname}
205 +.align 5
206 +.${fname}:
207 +
208 +___
209 +
210 + $self->save_registers();
211 +
212 + $self->add_code(<<___);
213 + ld $n0,0($n0)
214 +
215 + ld $bp0,0($bp)
216 +
217 + ld $apj,0($ap)
218 +___
219 +
220 + $self->mul_c_0($tp[0], $apj, $bp0, $c0);
221 +
222 + for (my $j = 1; $j < $n - 1; $j++) {
223 + $self->add_code(<<___);
224 + ld $apj,`$j*$SIZE_T`($ap)
225 +___
226 + $self->mul($tp[$j], $apj, $bp0, $c0);
227 + }
228 +
229 + $self->add_code(<<___);
230 + ld $apj,`($n-1)*$SIZE_T`($ap)
231 +___
232 +
233 + $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
234 +
235 + $self->add_code(<<___);
236 + li $tp[$n+1],0
237 +
238 +___
239 +
240 + $self->add_code(<<___);
241 + li $i,0
242 + mtctr $num
243 + b $label->{"enter"}
244 +
245 +.align 4
246 +$label->{"outer"}:
247 + ldx $bpi,$bp,$i
248 +
249 + ld $apj,0($ap)
250 +___
251 +
252 + $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
253 +
254 + for (my $j = 1; $j < $n; $j++) {
255 + $self->add_code(<<___);
256 + ld $apj,`$j*$SIZE_T`($ap)
257 +___
258 + $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
259 + }
260 +
261 + $self->add_code(<<___);
262 + addc $tp[$n],$tp[$n],$c0
263 + addze $tp[$n+1],$tp[$n+1]
264 +___
265 +
266 + $self->add_code(<<___);
267 +.align 4
268 +$label->{"enter"}:
269 + mulld $bpi,$tp[0],$n0
270 +
271 + ld $npj,0($np)
272 +___
273 +
274 + $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
275 +
276 + for (my $j = 1; $j < $n; $j++) {
277 + $self->add_code(<<___);
278 + ld $npj,`$j*$SIZE_T`($np)
279 +___
280 + $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
281 + }
282 +
283 + $self->add_code(<<___);
284 + addc $tp[$n-1],$tp[$n],$c0
285 + addze $tp[$n],$tp[$n+1]
286 +
287 + addi $i,$i,$SIZE_T
288 + bdnz $label->{"outer"}
289 +
290 + and. $tp[$n],$tp[$n],$tp[$n]
291 + bne $label->{"sub"}
292 +
293 + cmpld $tp[$n-1],$npj
294 + blt $label->{"copy"}
295 +
296 +$label->{"sub"}:
297 +___
298 +
299 + #
300 + # Reduction
301 + #
302 +
303 + $self->add_code(<<___);
304 + ld $bpj,`0*$SIZE_T`($np)
305 + subfc $c1,$bpj,$tp[0]
306 + std $c1,`0*$SIZE_T`($rp)
307 +
308 +___
309 + for (my $j = 1; $j < $n - 1; $j++) {
310 + $self->add_code(<<___);
311 + ld $bpj,`$j*$SIZE_T`($np)
312 + subfe $c1,$bpj,$tp[$j]
313 + std $c1,`$j*$SIZE_T`($rp)
314 +
315 +___
316 + }
317 +
318 + $self->add_code(<<___);
319 + subfe $c1,$npj,$tp[$n-1]
320 + std $c1,`($n-1)*$SIZE_T`($rp)
321 +
322 +___
323 +
324 + $self->add_code(<<___);
325 + addme. $tp[$n],$tp[$n]
326 + beq $label->{"end"}
327 +
328 +$label->{"copy"}:
329 +___
330 +
331 + $self->copy_result();
332 +
333 + $self->add_code(<<___);
334 +
335 +$label->{"end"}:
336 +___
337 +
338 + $self->restore_registers();
339 +
340 + $self->add_code(<<___);
341 + li r3,1
342 + blr
343 +.size .${fname},.-.${fname}
344 +___
345 +
346 +}
347 +
348 +package Mont::GPR;
349 +
350 +our @ISA = ('Mont');
351 +
352 +sub new($$)
353 +{
354 + my ($class, $n) = @_;
355 +
356 + return $class->SUPER::new($n);
357 +}
358 +
359 +sub save_registers($)
360 +{
361 + my ($self) = @_;
362 +
363 + my $n = $self->{n};
364 +
365 + $self->add_code(<<___);
366 + std $lo,-8($sp)
367 +___
368 +
369 + for (my $j = 0; $j <= $n+1; $j++) {
370 + $self->{code}.=<<___;
371 + std $tp[$j],-`($j+2)*8`($sp)
372 +___
373 + }
374 +
375 + $self->add_code(<<___);
376 +
377 +___
378 +}
379 +
380 +sub restore_registers($)
381 +{
382 + my ($self) = @_;
383 +
384 + my $n = $self->{n};
385 +
386 + $self->add_code(<<___);
387 + ld $lo,-8($sp)
388 +___
389 +
390 + for (my $j = 0; $j <= $n+1; $j++) {
391 + $self->{code}.=<<___;
392 + ld $tp[$j],-`($j+2)*8`($sp)
393 +___
394 + }
395 +
396 + $self->{code} .=<<___;
397 +
398 +___
399 +}
400 +
401 +# Direct translation of C mul()
402 +sub mul($$$$$)
403 +{
404 + my ($self, $r, $a, $w, $c) = @_;
405 +
406 + $self->add_code(<<___);
407 + mulld $lo,$a,$w
408 + addc $r,$lo,$c
409 + mulhdu $c,$a,$w
410 + addze $c,$c
411 +
412 +___
413 +}
414 +
415 +# Like mul() but $c is ignored as an input - an optimisation to save a
416 +# preliminary instruction that would set input $c to 0
417 +sub mul_c_0($$$$$)
418 +{
419 + my ($self, $r, $a, $w, $c) = @_;
420 +
421 + $self->add_code(<<___);
422 + mulld $r,$a,$w
423 + mulhdu $c,$a,$w
424 +
425 +___
426 +}
427 +
428 +# Like mul() but does not do the final addition of CA into $c - an
429 +# optimisation to save an instruction
430 +sub mul_last($$$$$$)
431 +{
432 + my ($self, $r1, $r2, $a, $w, $c) = @_;
433 +
434 + $self->add_code(<<___);
435 + mulld $lo,$a,$w
436 + addc $r1,$lo,$c
437 + mulhdu $c,$a,$w
438 +
439 + addze $r2,$c
440 +___
441 +}
442 +
443 +# Like C mul_add() but allow $r_out and $r_in to be different
444 +sub mul_add($$$$$$)
445 +{
446 + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
447 +
448 + $self->add_code(<<___);
449 + mulld $lo,$a,$w
450 + addc $lo,$lo,$c
451 + mulhdu $c,$a,$w
452 + addze $c,$c
453 + addc $r_out,$r_in,$lo
454 + addze $c,$c
455 +
456 +___
457 +}
458 +
459 +# Like mul_add() but $c is ignored as an input - an optimisation to save a
460 +# preliminary instruction that would set input $c to 0
461 +sub mul_add_c_0($$$$$$)
462 +{
463 + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
464 +
465 + $self->add_code(<<___);
466 + mulld $lo,$a,$w
467 + addc $r_out,$r_in,$lo
468 + mulhdu $c,$a,$w
469 + addze $c,$c
470 +
471 +___
472 +}
473 +
474 +package Mont::GPR_300;
475 +
476 +our @ISA = ('Mont::GPR');
477 +
478 +sub new($$)
479 +{
480 + my ($class, $n) = @_;
481 +
482 + my $mont = $class->SUPER::new($n);
483 +
484 + return $mont;
485 +}
486 +
487 +sub get_function_name($)
488 +{
489 + my ($self) = @_;
490 +
491 + return "bn_mul_mont_300_fixed_n" . $self->{n};
492 +}
493 +
494 +sub get_label($$)
495 +{
496 + my ($self, $l) = @_;
497 +
498 + return "L" . $l . "_300_" . $self->{n};
499 +}
500 +
501 +# Direct translation of C mul()
502 +sub mul($$$$$)
503 +{
504 + my ($self, $r, $a, $w, $c, $last) = @_;
505 +
506 + $self->add_code(<<___);
507 + maddld $r,$a,$w,$c
508 + maddhdu $c,$a,$w,$c
509 +
510 +___
511 +}
512 +
513 +# Save the last carry as the final entry
514 +sub mul_last($$$$$)
515 +{
516 + my ($self, $r1, $r2, $a, $w, $c) = @_;
517 +
518 + $self->add_code(<<___);
519 + maddld $r1,$a,$w,$c
520 + maddhdu $r2,$a,$w,$c
521 +
522 +___
523 +}
524 +
525 +# Like mul() but $c is ignored as an input - an optimisation to save a
526 +# preliminary instruction that would set input $c to 0
527 +sub mul_c_0($$$$$)
528 +{
529 + my ($self, $r, $a, $w, $c) = @_;
530 +
531 + $self->add_code(<<___);
532 + mulld $r,$a,$w
533 + mulhdu $c,$a,$w
534 +
535 +___
536 +}
537 +
538 +# Like C mul_add() but allow $r_out and $r_in to be different
539 +sub mul_add($$$$$$)
540 +{
541 + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
542 +
543 + $self->add_code(<<___);
544 + maddld $lo,$a,$w,$c
545 + maddhdu $c,$a,$w,$c
546 + addc $r_out,$r_in,$lo
547 + addze $c,$c
548 +
549 +___
550 +}
551 +
552 +# Like mul_add() but $c is ignored as an input - an optimisation to save a
553 +# preliminary instruction that would set input $c to 0
554 +sub mul_add_c_0($$$$$$)
555 +{
556 + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
557 +
558 + $self->add_code(<<___);
559 + maddld $lo,$a,$w,$r_in
560 + maddhdu $c,$a,$w,$r_in
561 +___
562 +
563 + if ($r_out ne $lo) {
564 + $self->add_code(<<___);
565 + mr $r_out,$lo
566 +___
567 + }
568 +
569 + $self->nl();
570 +}
571 +
572 +
573 +package main;
574 +
575 +my $code;
576 +
577 +$code.=<<___;
578 +.machine "any"
579 +.text
580 +___
581 +
582 +my $mont;
583 +
584 +$mont = new Mont::GPR(6);
585 +$mont->mul_mont_fixed();
586 +$code .= $mont->get_code();
587 +
588 +$mont = new Mont::GPR_300(6);
589 +$mont->mul_mont_fixed();
590 +$code .= $mont->get_code();
591 +
592 +$code =~ s/\`([^\`]*)\`/eval $1/gem;
593 +
594 +$code.=<<___;
595 +.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
596 +___
597 +
598 +print $code;
599 +close STDOUT or die "error closing STDOUT: $!";
600 diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
601 index 3ee76ea96574..1e9421bee213 100644
602 --- a/crypto/bn/bn_ppc.c
603 +++ b/crypto/bn/bn_ppc.c
604 @@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
605 const BN_ULONG *np, const BN_ULONG *n0, int num);
606 int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
607 const BN_ULONG *np, const BN_ULONG *n0, int num);
608 + int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
609 + const BN_ULONG *bp, const BN_ULONG *np,
610 + const BN_ULONG *n0, int num);
611 + int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
612 + const BN_ULONG *bp, const BN_ULONG *np,
613 + const BN_ULONG *n0, int num);
614
615 if (num < 4)
616 return 0;
617 @@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
618 * no opportunity to figure it out...
619 */
620
621 +#if defined(_ARCH_PPC64) && !defined(__ILP32__)
622 + if (num == 6) {
623 + if (OPENSSL_ppccap_P & PPC_MADD300)
624 + return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
625 + else
626 + return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
627 + }
628 +#endif
629 +
630 return bn_mul_mont_int(rp, ap, bp, np, n0, num);
631 }
632 diff --git a/crypto/bn/build.info b/crypto/bn/build.info
633 index 4f8d0689b5ea..987a70ae263b 100644
634 --- a/crypto/bn/build.info
635 +++ b/crypto/bn/build.info
636 @@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
637
638 $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s
639 $BNDEF_ppc32=OPENSSL_BN_ASM_MONT
640 - $BNASM_ppc64=$BNASM_ppc32
641 + $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
642 $BNDEF_ppc64=$BNDEF_ppc32
643
644 $BNASM_c64xplus=asm/bn-c64xplus.asm
645 @@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
646 GENERATE[bn-ppc.s]=asm/ppc.pl
647 GENERATE[ppc-mont.s]=asm/ppc-mont.pl
648 GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
649 +GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
650
651 GENERATE[alpha-mont.S]=asm/alpha-mont.pl
652
653
654 From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001
655 From: Rohan McLure <rohanmclure@linux.ibm.com>
656 Date: Thu, 30 Jun 2022 16:21:06 +1000
657 Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9
658
659 In the reference C implementation in bn_asm.c, tp[num + 1] contains the
660 carry bit for accumulations into tp[num]. tp[num + 1] is only ever
661 assigned, never itself incremented.
662 ---
663 crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++--
664 1 file changed, 4 insertions(+), 2 deletions(-)
665
666 diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
667 index 0fb397bc5f12..e27d0ad93d85 100755
668 --- a/crypto/bn/asm/ppc64-mont-fixed.pl
669 +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
670 @@ -63,6 +63,7 @@
671 # Registers are global so the code is remotely readable
672
673 # Parameters for Montgomery multiplication
674 +my $ze = "r0";
675 my $sp = "r1";
676 my $toc = "r2";
677 my $rp = "r3";
678 @@ -192,6 +193,7 @@ ($)
679 $self->save_registers();
680
681 $self->add_code(<<___);
682 + li $ze,0
683 ld $n0,0($n0)
684
685 ld $bp0,0($bp)
686 @@ -242,7 +244,7 @@ ($)
687
688 $self->add_code(<<___);
689 addc $tp[$n],$tp[$n],$c0
690 - addze $tp[$n+1],$tp[$n+1]
691 + addze $tp[$n+1],$ze
692 ___
693
694 $self->add_code(<<___);
695 @@ -272,7 +274,7 @@ ($)
696 and. $tp[$n],$tp[$n],$tp[$n]
697 bne $label->{"sub"}
698
699 - cmpld $tp[$n-1],$npj
700 + cmpld $tp[$n-1],$npj
701 blt $label->{"copy"}
702
703 $label->{"sub"}:

admin@koozali.org
ViewVC Help
Powered by ViewVC 1.2.1 RSS 2.0 feed