/[smecontribs]/rpms/openssl3/contribs10/0067-ppc64le-Montgomery-multiply.patch
ViewVC logotype

Annotation of /rpms/openssl3/contribs10/0067-ppc64le-Montgomery-multiply.patch

Parent Directory | Revision Log | View Revision Graph


Revision 1.1 - (hide annotations) (download)
Wed Jan 31 17:24:43 2024 UTC (10 months ago) by jpp
Branch: MAIN
CVS Tags: openssl3-3_0_7-5_el7_sme_1, HEAD
Initial import

1 jpp 1.1 From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001
2     From: Rohan McLure <rohanmclure@linux.ibm.com>
3     Date: Mon, 27 Jun 2022 12:14:55 +1000
4     Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC
5     Montgomery Multiplication""
6    
7     This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.
8     ---
9     crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++
10     crypto/bn/bn_ppc.c | 15 +
11     crypto/bn/build.info | 3 +-
12     3 files changed, 598 insertions(+), 1 deletion(-)
13    
14     diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
15     index e69de29bb2d1..0fb397bc5f12 100755
16     --- a/crypto/bn/asm/ppc64-mont-fixed.pl
17     +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
18     @@ -0,0 +1,581 @@
19     +#! /usr/bin/env perl
20     +# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
21     +#
22     +# Licensed under the Apache License 2.0 (the "License"). You may not use
23     +# this file except in compliance with the License. You can obtain a copy
24     +# in the file LICENSE in the source distribution or at
25     +# https://www.openssl.org/source/license.html
26     +
27     +# ====================================================================
28     +# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
29     +# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
30     +# the OpenSSL project.
31     +# ====================================================================
32     +
33     +#
34     +# Fixed length (n=6), unrolled PPC Montgomery Multiplication
35     +#
36     +
37     +# 2021
38     +#
39     +# Although this is a generic implementation for unrolling Montgomery
40     +# Multiplication for arbitrary values of n, this is currently only
41     +# used for n = 6 to improve the performance of ECC p384.
42     +#
43     +# Unrolling allows intermediate results to be stored in registers,
44     +# rather than on the stack, improving performance by ~7% compared to
45     +# the existing PPC assembly code.
46     +#
47     +# The ISA 3.0 implementation uses combination multiply/add
48     +# instructions (maddld, maddhdu) to improve performance by an
49     +# additional ~10% on Power 9.
50     +#
51     +# Finally, saving non-volatile registers into volatile vector
52     +# registers instead of onto the stack saves a little more.
53     +#
54     +# On a Power 9 machine we see an overall improvement of ~18%.
55     +#
56     +
57     +use strict;
58     +use warnings;
59     +
60     +my ($flavour, $output, $dir, $xlate);
61     +
62     +# $output is the last argument if it looks like a file (it has an extension)
63     +# $flavour is the first argument if it doesn't look like a file
64     +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
65     +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
66     +
67     +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68     +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
69     +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
70     +die "can't locate ppc-xlate.pl";
71     +
72     +open STDOUT,"| $^X $xlate $flavour \"$output\""
73     + or die "can't call $xlate: $!";
74     +
75     +if ($flavour !~ /64/) {
76     + die "bad flavour ($flavour) - only ppc64 permitted";
77     +}
78     +
79     +my $SIZE_T= 8;
80     +
81     +# Registers are global so the code is remotely readable
82     +
83     +# Parameters for Montgomery multiplication
84     +my $sp = "r1";
85     +my $toc = "r2";
86     +my $rp = "r3";
87     +my $ap = "r4";
88     +my $bp = "r5";
89     +my $np = "r6";
90     +my $n0 = "r7";
91     +my $num = "r8";
92     +
93     +my $i = "r9";
94     +my $c0 = "r10";
95     +my $bp0 = "r11";
96     +my $bpi = "r11";
97     +my $bpj = "r11";
98     +my $tj = "r12";
99     +my $apj = "r12";
100     +my $npj = "r12";
101     +my $lo = "r14";
102     +my $c1 = "r14";
103     +
104     +# Non-volatile registers used for tp[i]
105     +#
106     +# 12 registers are available but the limit on unrolling is 10,
107     +# since registers from $tp[0] to $tp[$n+1] are used.
108     +my @tp = ("r20" .. "r31");
109     +
110     +# volatile VSRs for saving non-volatile GPRs - faster than stack
111     +my @vsrs = ("v32" .. "v46");
112     +
113     +package Mont;
114     +
115     +sub new($$)
116     +{
117     + my ($class, $n) = @_;
118     +
119     + if ($n > 10) {
120     + die "Can't unroll for BN length ${n} (maximum 10)"
121     + }
122     +
123     + my $self = {
124     + code => "",
125     + n => $n,
126     + };
127     + bless $self, $class;
128     +
129     + return $self;
130     +}
131     +
132     +sub add_code($$)
133     +{
134     + my ($self, $c) = @_;
135     +
136     + $self->{code} .= $c;
137     +}
138     +
139     +sub get_code($)
140     +{
141     + my ($self) = @_;
142     +
143     + return $self->{code};
144     +}
145     +
146     +sub get_function_name($)
147     +{
148     + my ($self) = @_;
149     +
150     + return "bn_mul_mont_fixed_n" . $self->{n};
151     +}
152     +
153     +sub get_label($$)
154     +{
155     + my ($self, $l) = @_;
156     +
157     + return "L" . $l . "_" . $self->{n};
158     +}
159     +
160     +sub get_labels($@)
161     +{
162     + my ($self, @labels) = @_;
163     +
164     + my %out = ();
165     +
166     + foreach my $l (@labels) {
167     + $out{"$l"} = $self->get_label("$l");
168     + }
169     +
170     + return \%out;
171     +}
172     +
173     +sub nl($)
174     +{
175     + my ($self) = @_;
176     +
177     + $self->add_code("\n");
178     +}
179     +
180     +sub copy_result($)
181     +{
182     + my ($self) = @_;
183     +
184     + my ($n) = $self->{n};
185     +
186     + for (my $j = 0; $j < $n; $j++) {
187     + $self->add_code(<<___);
188     + std $tp[$j],`$j*$SIZE_T`($rp)
189     +___
190     + }
191     +
192     +}
193     +
194     +sub mul_mont_fixed($)
195     +{
196     + my ($self) = @_;
197     +
198     + my ($n) = $self->{n};
199     + my $fname = $self->get_function_name();
200     + my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
201     +
202     + $self->add_code(<<___);
203     +
204     +.globl .${fname}
205     +.align 5
206     +.${fname}:
207     +
208     +___
209     +
210     + $self->save_registers();
211     +
212     + $self->add_code(<<___);
213     + ld $n0,0($n0)
214     +
215     + ld $bp0,0($bp)
216     +
217     + ld $apj,0($ap)
218     +___
219     +
220     + $self->mul_c_0($tp[0], $apj, $bp0, $c0);
221     +
222     + for (my $j = 1; $j < $n - 1; $j++) {
223     + $self->add_code(<<___);
224     + ld $apj,`$j*$SIZE_T`($ap)
225     +___
226     + $self->mul($tp[$j], $apj, $bp0, $c0);
227     + }
228     +
229     + $self->add_code(<<___);
230     + ld $apj,`($n-1)*$SIZE_T`($ap)
231     +___
232     +
233     + $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
234     +
235     + $self->add_code(<<___);
236     + li $tp[$n+1],0
237     +
238     +___
239     +
240     + $self->add_code(<<___);
241     + li $i,0
242     + mtctr $num
243     + b $label->{"enter"}
244     +
245     +.align 4
246     +$label->{"outer"}:
247     + ldx $bpi,$bp,$i
248     +
249     + ld $apj,0($ap)
250     +___
251     +
252     + $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
253     +
254     + for (my $j = 1; $j < $n; $j++) {
255     + $self->add_code(<<___);
256     + ld $apj,`$j*$SIZE_T`($ap)
257     +___
258     + $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
259     + }
260     +
261     + $self->add_code(<<___);
262     + addc $tp[$n],$tp[$n],$c0
263     + addze $tp[$n+1],$tp[$n+1]
264     +___
265     +
266     + $self->add_code(<<___);
267     +.align 4
268     +$label->{"enter"}:
269     + mulld $bpi,$tp[0],$n0
270     +
271     + ld $npj,0($np)
272     +___
273     +
274     + $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
275     +
276     + for (my $j = 1; $j < $n; $j++) {
277     + $self->add_code(<<___);
278     + ld $npj,`$j*$SIZE_T`($np)
279     +___
280     + $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
281     + }
282     +
283     + $self->add_code(<<___);
284     + addc $tp[$n-1],$tp[$n],$c0
285     + addze $tp[$n],$tp[$n+1]
286     +
287     + addi $i,$i,$SIZE_T
288     + bdnz $label->{"outer"}
289     +
290     + and. $tp[$n],$tp[$n],$tp[$n]
291     + bne $label->{"sub"}
292     +
293     + cmpld $tp[$n-1],$npj
294     + blt $label->{"copy"}
295     +
296     +$label->{"sub"}:
297     +___
298     +
299     + #
300     + # Reduction
301     + #
302     +
303     + $self->add_code(<<___);
304     + ld $bpj,`0*$SIZE_T`($np)
305     + subfc $c1,$bpj,$tp[0]
306     + std $c1,`0*$SIZE_T`($rp)
307     +
308     +___
309     + for (my $j = 1; $j < $n - 1; $j++) {
310     + $self->add_code(<<___);
311     + ld $bpj,`$j*$SIZE_T`($np)
312     + subfe $c1,$bpj,$tp[$j]
313     + std $c1,`$j*$SIZE_T`($rp)
314     +
315     +___
316     + }
317     +
318     + $self->add_code(<<___);
319     + subfe $c1,$npj,$tp[$n-1]
320     + std $c1,`($n-1)*$SIZE_T`($rp)
321     +
322     +___
323     +
324     + $self->add_code(<<___);
325     + addme. $tp[$n],$tp[$n]
326     + beq $label->{"end"}
327     +
328     +$label->{"copy"}:
329     +___
330     +
331     + $self->copy_result();
332     +
333     + $self->add_code(<<___);
334     +
335     +$label->{"end"}:
336     +___
337     +
338     + $self->restore_registers();
339     +
340     + $self->add_code(<<___);
341     + li r3,1
342     + blr
343     +.size .${fname},.-.${fname}
344     +___
345     +
346     +}
347     +
348     +package Mont::GPR;
349     +
350     +our @ISA = ('Mont');
351     +
352     +sub new($$)
353     +{
354     + my ($class, $n) = @_;
355     +
356     + return $class->SUPER::new($n);
357     +}
358     +
359     +sub save_registers($)
360     +{
361     + my ($self) = @_;
362     +
363     + my $n = $self->{n};
364     +
365     + $self->add_code(<<___);
366     + std $lo,-8($sp)
367     +___
368     +
369     + for (my $j = 0; $j <= $n+1; $j++) {
370     + $self->{code}.=<<___;
371     + std $tp[$j],-`($j+2)*8`($sp)
372     +___
373     + }
374     +
375     + $self->add_code(<<___);
376     +
377     +___
378     +}
379     +
380     +sub restore_registers($)
381     +{
382     + my ($self) = @_;
383     +
384     + my $n = $self->{n};
385     +
386     + $self->add_code(<<___);
387     + ld $lo,-8($sp)
388     +___
389     +
390     + for (my $j = 0; $j <= $n+1; $j++) {
391     + $self->{code}.=<<___;
392     + ld $tp[$j],-`($j+2)*8`($sp)
393     +___
394     + }
395     +
396     + $self->{code} .=<<___;
397     +
398     +___
399     +}
400     +
401     +# Direct translation of C mul()
402     +sub mul($$$$$)
403     +{
404     + my ($self, $r, $a, $w, $c) = @_;
405     +
406     + $self->add_code(<<___);
407     + mulld $lo,$a,$w
408     + addc $r,$lo,$c
409     + mulhdu $c,$a,$w
410     + addze $c,$c
411     +
412     +___
413     +}
414     +
415     +# Like mul() but $c is ignored as an input - an optimisation to save a
416     +# preliminary instruction that would set input $c to 0
417     +sub mul_c_0($$$$$)
418     +{
419     + my ($self, $r, $a, $w, $c) = @_;
420     +
421     + $self->add_code(<<___);
422     + mulld $r,$a,$w
423     + mulhdu $c,$a,$w
424     +
425     +___
426     +}
427     +
428     +# Like mul() but does not do the final addition of CA into $c - an
429     +# optimisation to save an instruction
430     +sub mul_last($$$$$$)
431     +{
432     + my ($self, $r1, $r2, $a, $w, $c) = @_;
433     +
434     + $self->add_code(<<___);
435     + mulld $lo,$a,$w
436     + addc $r1,$lo,$c
437     + mulhdu $c,$a,$w
438     +
439     + addze $r2,$c
440     +___
441     +}
442     +
443     +# Like C mul_add() but allow $r_out and $r_in to be different
444     +sub mul_add($$$$$$)
445     +{
446     + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
447     +
448     + $self->add_code(<<___);
449     + mulld $lo,$a,$w
450     + addc $lo,$lo,$c
451     + mulhdu $c,$a,$w
452     + addze $c,$c
453     + addc $r_out,$r_in,$lo
454     + addze $c,$c
455     +
456     +___
457     +}
458     +
459     +# Like mul_add() but $c is ignored as an input - an optimisation to save a
460     +# preliminary instruction that would set input $c to 0
461     +sub mul_add_c_0($$$$$$)
462     +{
463     + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
464     +
465     + $self->add_code(<<___);
466     + mulld $lo,$a,$w
467     + addc $r_out,$r_in,$lo
468     + mulhdu $c,$a,$w
469     + addze $c,$c
470     +
471     +___
472     +}
473     +
474     +package Mont::GPR_300;
475     +
476     +our @ISA = ('Mont::GPR');
477     +
478     +sub new($$)
479     +{
480     + my ($class, $n) = @_;
481     +
482     + my $mont = $class->SUPER::new($n);
483     +
484     + return $mont;
485     +}
486     +
487     +sub get_function_name($)
488     +{
489     + my ($self) = @_;
490     +
491     + return "bn_mul_mont_300_fixed_n" . $self->{n};
492     +}
493     +
494     +sub get_label($$)
495     +{
496     + my ($self, $l) = @_;
497     +
498     + return "L" . $l . "_300_" . $self->{n};
499     +}
500     +
501     +# Direct translation of C mul()
502     +sub mul($$$$$)
503     +{
504     + my ($self, $r, $a, $w, $c, $last) = @_;
505     +
506     + $self->add_code(<<___);
507     + maddld $r,$a,$w,$c
508     + maddhdu $c,$a,$w,$c
509     +
510     +___
511     +}
512     +
513     +# Save the last carry as the final entry
514     +sub mul_last($$$$$)
515     +{
516     + my ($self, $r1, $r2, $a, $w, $c) = @_;
517     +
518     + $self->add_code(<<___);
519     + maddld $r1,$a,$w,$c
520     + maddhdu $r2,$a,$w,$c
521     +
522     +___
523     +}
524     +
525     +# Like mul() but $c is ignored as an input - an optimisation to save a
526     +# preliminary instruction that would set input $c to 0
527     +sub mul_c_0($$$$$)
528     +{
529     + my ($self, $r, $a, $w, $c) = @_;
530     +
531     + $self->add_code(<<___);
532     + mulld $r,$a,$w
533     + mulhdu $c,$a,$w
534     +
535     +___
536     +}
537     +
538     +# Like C mul_add() but allow $r_out and $r_in to be different
539     +sub mul_add($$$$$$)
540     +{
541     + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
542     +
543     + $self->add_code(<<___);
544     + maddld $lo,$a,$w,$c
545     + maddhdu $c,$a,$w,$c
546     + addc $r_out,$r_in,$lo
547     + addze $c,$c
548     +
549     +___
550     +}
551     +
552     +# Like mul_add() but $c is ignored as an input - an optimisation to save a
553     +# preliminary instruction that would set input $c to 0
554     +sub mul_add_c_0($$$$$$)
555     +{
556     + my ($self, $r_out, $r_in, $a, $w, $c) = @_;
557     +
558     + $self->add_code(<<___);
559     + maddld $lo,$a,$w,$r_in
560     + maddhdu $c,$a,$w,$r_in
561     +___
562     +
563     + if ($r_out ne $lo) {
564     + $self->add_code(<<___);
565     + mr $r_out,$lo
566     +___
567     + }
568     +
569     + $self->nl();
570     +}
571     +
572     +
573     +package main;
574     +
575     +my $code;
576     +
577     +$code.=<<___;
578     +.machine "any"
579     +.text
580     +___
581     +
582     +my $mont;
583     +
584     +$mont = new Mont::GPR(6);
585     +$mont->mul_mont_fixed();
586     +$code .= $mont->get_code();
587     +
588     +$mont = new Mont::GPR_300(6);
589     +$mont->mul_mont_fixed();
590     +$code .= $mont->get_code();
591     +
592     +$code =~ s/\`([^\`]*)\`/eval $1/gem;
593     +
594     +$code.=<<___;
595     +.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
596     +___
597     +
598     +print $code;
599     +close STDOUT or die "error closing STDOUT: $!";
600     diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
601     index 3ee76ea96574..1e9421bee213 100644
602     --- a/crypto/bn/bn_ppc.c
603     +++ b/crypto/bn/bn_ppc.c
604     @@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
605     const BN_ULONG *np, const BN_ULONG *n0, int num);
606     int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
607     const BN_ULONG *np, const BN_ULONG *n0, int num);
608     + int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
609     + const BN_ULONG *bp, const BN_ULONG *np,
610     + const BN_ULONG *n0, int num);
611     + int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
612     + const BN_ULONG *bp, const BN_ULONG *np,
613     + const BN_ULONG *n0, int num);
614    
615     if (num < 4)
616     return 0;
617     @@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
618     * no opportunity to figure it out...
619     */
620    
621     +#if defined(_ARCH_PPC64) && !defined(__ILP32__)
622     + if (num == 6) {
623     + if (OPENSSL_ppccap_P & PPC_MADD300)
624     + return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
625     + else
626     + return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
627     + }
628     +#endif
629     +
630     return bn_mul_mont_int(rp, ap, bp, np, n0, num);
631     }
632     diff --git a/crypto/bn/build.info b/crypto/bn/build.info
633     index 4f8d0689b5ea..987a70ae263b 100644
634     --- a/crypto/bn/build.info
635     +++ b/crypto/bn/build.info
636     @@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
637    
638     $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s
639     $BNDEF_ppc32=OPENSSL_BN_ASM_MONT
640     - $BNASM_ppc64=$BNASM_ppc32
641     + $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
642     $BNDEF_ppc64=$BNDEF_ppc32
643    
644     $BNASM_c64xplus=asm/bn-c64xplus.asm
645     @@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
646     GENERATE[bn-ppc.s]=asm/ppc.pl
647     GENERATE[ppc-mont.s]=asm/ppc-mont.pl
648     GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
649     +GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
650    
651     GENERATE[alpha-mont.S]=asm/alpha-mont.pl
652    
653    
654     From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001
655     From: Rohan McLure <rohanmclure@linux.ibm.com>
656     Date: Thu, 30 Jun 2022 16:21:06 +1000
657     Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9
658    
659     In the reference C implementation in bn_asm.c, tp[num + 1] contains the
660     carry bit for accumulations into tp[num]. tp[num + 1] is only ever
661     assigned, never itself incremented.
662     ---
663     crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++--
664     1 file changed, 4 insertions(+), 2 deletions(-)
665    
666     diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
667     index 0fb397bc5f12..e27d0ad93d85 100755
668     --- a/crypto/bn/asm/ppc64-mont-fixed.pl
669     +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
670     @@ -63,6 +63,7 @@
671     # Registers are global so the code is remotely readable
672    
673     # Parameters for Montgomery multiplication
674     +my $ze = "r0";
675     my $sp = "r1";
676     my $toc = "r2";
677     my $rp = "r3";
678     @@ -192,6 +193,7 @@ ($)
679     $self->save_registers();
680    
681     $self->add_code(<<___);
682     + li $ze,0
683     ld $n0,0($n0)
684    
685     ld $bp0,0($bp)
686     @@ -242,7 +244,7 @@ ($)
687    
688     $self->add_code(<<___);
689     addc $tp[$n],$tp[$n],$c0
690     - addze $tp[$n+1],$tp[$n+1]
691     + addze $tp[$n+1],$ze
692     ___
693    
694     $self->add_code(<<___);
695     @@ -272,7 +274,7 @@ ($)
696     and. $tp[$n],$tp[$n],$tp[$n]
697     bne $label->{"sub"}
698    
699     - cmpld $tp[$n-1],$npj
700     + cmpld $tp[$n-1],$npj
701     blt $label->{"copy"}
702    
703     $label->{"sub"}:

admin@koozali.org
ViewVC Help
Powered by ViewVC 1.2.1 RSS 2.0 feed