1 |
From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001 |
2 |
From: Rohan McLure <rohanmclure@linux.ibm.com> |
3 |
Date: Mon, 27 Jun 2022 12:14:55 +1000 |
4 |
Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC |
5 |
Montgomery Multiplication"" |
6 |
|
7 |
This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e. |
8 |
--- |
9 |
crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++ |
10 |
crypto/bn/bn_ppc.c | 15 + |
11 |
crypto/bn/build.info | 3 +- |
12 |
3 files changed, 598 insertions(+), 1 deletion(-) |
13 |
|
14 |
diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl |
15 |
index e69de29bb2d1..0fb397bc5f12 100755 |
16 |
--- a/crypto/bn/asm/ppc64-mont-fixed.pl |
17 |
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl |
18 |
@@ -0,0 +1,581 @@ |
19 |
+#! /usr/bin/env perl |
20 |
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. |
21 |
+# |
22 |
+# Licensed under the Apache License 2.0 (the "License"). You may not use |
23 |
+# this file except in compliance with the License. You can obtain a copy |
24 |
+# in the file LICENSE in the source distribution or at |
25 |
+# https://www.openssl.org/source/license.html |
26 |
+ |
27 |
+# ==================================================================== |
28 |
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke |
29 |
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for |
30 |
+# the OpenSSL project. |
31 |
+# ==================================================================== |
32 |
+ |
33 |
+# |
34 |
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication |
35 |
+# |
36 |
+ |
37 |
+# 2021 |
38 |
+# |
39 |
+# Although this is a generic implementation for unrolling Montgomery |
40 |
+# Multiplication for arbitrary values of n, this is currently only |
41 |
+# used for n = 6 to improve the performance of ECC p384. |
42 |
+# |
43 |
+# Unrolling allows intermediate results to be stored in registers, |
44 |
+# rather than on the stack, improving performance by ~7% compared to |
45 |
+# the existing PPC assembly code. |
46 |
+# |
47 |
+# The ISA 3.0 implementation uses combination multiply/add |
48 |
+# instructions (maddld, maddhdu) to improve performance by an |
49 |
+# additional ~10% on Power 9. |
50 |
+# |
51 |
+# Finally, saving non-volatile registers into volatile vector |
52 |
+# registers instead of onto the stack saves a little more. |
53 |
+# |
54 |
+# On a Power 9 machine we see an overall improvement of ~18%. |
55 |
+# |
56 |
+ |
57 |
+use strict; |
58 |
+use warnings; |
59 |
+ |
60 |
+my ($flavour, $output, $dir, $xlate); |
61 |
+ |
62 |
+# $output is the last argument if it looks like a file (it has an extension) |
63 |
+# $flavour is the first argument if it doesn't look like a file |
64 |
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; |
65 |
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; |
66 |
+ |
67 |
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
68 |
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
69 |
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
70 |
+die "can't locate ppc-xlate.pl"; |
71 |
+ |
72 |
+open STDOUT,"| $^X $xlate $flavour \"$output\"" |
73 |
+ or die "can't call $xlate: $!"; |
74 |
+ |
75 |
+if ($flavour !~ /64/) { |
76 |
+ die "bad flavour ($flavour) - only ppc64 permitted"; |
77 |
+} |
78 |
+ |
79 |
+my $SIZE_T= 8; |
80 |
+ |
81 |
+# Registers are global so the code is remotely readable |
82 |
+ |
83 |
+# Parameters for Montgomery multiplication |
84 |
+my $sp = "r1"; |
85 |
+my $toc = "r2"; |
86 |
+my $rp = "r3"; |
87 |
+my $ap = "r4"; |
88 |
+my $bp = "r5"; |
89 |
+my $np = "r6"; |
90 |
+my $n0 = "r7"; |
91 |
+my $num = "r8"; |
92 |
+ |
93 |
+my $i = "r9"; |
94 |
+my $c0 = "r10"; |
95 |
+my $bp0 = "r11"; |
96 |
+my $bpi = "r11"; |
97 |
+my $bpj = "r11"; |
98 |
+my $tj = "r12"; |
99 |
+my $apj = "r12"; |
100 |
+my $npj = "r12"; |
101 |
+my $lo = "r14"; |
102 |
+my $c1 = "r14"; |
103 |
+ |
104 |
+# Non-volatile registers used for tp[i] |
105 |
+# |
106 |
+# 12 registers are available but the limit on unrolling is 10, |
107 |
+# since registers from $tp[0] to $tp[$n+1] are used. |
108 |
+my @tp = ("r20" .. "r31"); |
109 |
+ |
110 |
+# volatile VSRs for saving non-volatile GPRs - faster than stack |
111 |
+my @vsrs = ("v32" .. "v46"); |
112 |
+ |
113 |
+package Mont; |
114 |
+ |
115 |
+sub new($$) |
116 |
+{ |
117 |
+ my ($class, $n) = @_; |
118 |
+ |
119 |
+ if ($n > 10) { |
120 |
+ die "Can't unroll for BN length ${n} (maximum 10)" |
121 |
+ } |
122 |
+ |
123 |
+ my $self = { |
124 |
+ code => "", |
125 |
+ n => $n, |
126 |
+ }; |
127 |
+ bless $self, $class; |
128 |
+ |
129 |
+ return $self; |
130 |
+} |
131 |
+ |
132 |
+sub add_code($$) |
133 |
+{ |
134 |
+ my ($self, $c) = @_; |
135 |
+ |
136 |
+ $self->{code} .= $c; |
137 |
+} |
138 |
+ |
139 |
+sub get_code($) |
140 |
+{ |
141 |
+ my ($self) = @_; |
142 |
+ |
143 |
+ return $self->{code}; |
144 |
+} |
145 |
+ |
146 |
+sub get_function_name($) |
147 |
+{ |
148 |
+ my ($self) = @_; |
149 |
+ |
150 |
+ return "bn_mul_mont_fixed_n" . $self->{n}; |
151 |
+} |
152 |
+ |
153 |
+sub get_label($$) |
154 |
+{ |
155 |
+ my ($self, $l) = @_; |
156 |
+ |
157 |
+ return "L" . $l . "_" . $self->{n}; |
158 |
+} |
159 |
+ |
160 |
+sub get_labels($@) |
161 |
+{ |
162 |
+ my ($self, @labels) = @_; |
163 |
+ |
164 |
+ my %out = (); |
165 |
+ |
166 |
+ foreach my $l (@labels) { |
167 |
+ $out{"$l"} = $self->get_label("$l"); |
168 |
+ } |
169 |
+ |
170 |
+ return \%out; |
171 |
+} |
172 |
+ |
173 |
+sub nl($) |
174 |
+{ |
175 |
+ my ($self) = @_; |
176 |
+ |
177 |
+ $self->add_code("\n"); |
178 |
+} |
179 |
+ |
180 |
+sub copy_result($) |
181 |
+{ |
182 |
+ my ($self) = @_; |
183 |
+ |
184 |
+ my ($n) = $self->{n}; |
185 |
+ |
186 |
+ for (my $j = 0; $j < $n; $j++) { |
187 |
+ $self->add_code(<<___); |
188 |
+ std $tp[$j],`$j*$SIZE_T`($rp) |
189 |
+___ |
190 |
+ } |
191 |
+ |
192 |
+} |
193 |
+ |
194 |
+sub mul_mont_fixed($) |
195 |
+{ |
196 |
+ my ($self) = @_; |
197 |
+ |
198 |
+ my ($n) = $self->{n}; |
199 |
+ my $fname = $self->get_function_name(); |
200 |
+ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); |
201 |
+ |
202 |
+ $self->add_code(<<___); |
203 |
+ |
204 |
+.globl .${fname} |
205 |
+.align 5 |
206 |
+.${fname}: |
207 |
+ |
208 |
+___ |
209 |
+ |
210 |
+ $self->save_registers(); |
211 |
+ |
212 |
+ $self->add_code(<<___); |
213 |
+ ld $n0,0($n0) |
214 |
+ |
215 |
+ ld $bp0,0($bp) |
216 |
+ |
217 |
+ ld $apj,0($ap) |
218 |
+___ |
219 |
+ |
220 |
+ $self->mul_c_0($tp[0], $apj, $bp0, $c0); |
221 |
+ |
222 |
+ for (my $j = 1; $j < $n - 1; $j++) { |
223 |
+ $self->add_code(<<___); |
224 |
+ ld $apj,`$j*$SIZE_T`($ap) |
225 |
+___ |
226 |
+ $self->mul($tp[$j], $apj, $bp0, $c0); |
227 |
+ } |
228 |
+ |
229 |
+ $self->add_code(<<___); |
230 |
+ ld $apj,`($n-1)*$SIZE_T`($ap) |
231 |
+___ |
232 |
+ |
233 |
+ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); |
234 |
+ |
235 |
+ $self->add_code(<<___); |
236 |
+ li $tp[$n+1],0 |
237 |
+ |
238 |
+___ |
239 |
+ |
240 |
+ $self->add_code(<<___); |
241 |
+ li $i,0 |
242 |
+ mtctr $num |
243 |
+ b $label->{"enter"} |
244 |
+ |
245 |
+.align 4 |
246 |
+$label->{"outer"}: |
247 |
+ ldx $bpi,$bp,$i |
248 |
+ |
249 |
+ ld $apj,0($ap) |
250 |
+___ |
251 |
+ |
252 |
+ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); |
253 |
+ |
254 |
+ for (my $j = 1; $j < $n; $j++) { |
255 |
+ $self->add_code(<<___); |
256 |
+ ld $apj,`$j*$SIZE_T`($ap) |
257 |
+___ |
258 |
+ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); |
259 |
+ } |
260 |
+ |
261 |
+ $self->add_code(<<___); |
262 |
+ addc $tp[$n],$tp[$n],$c0 |
263 |
+ addze $tp[$n+1],$tp[$n+1] |
264 |
+___ |
265 |
+ |
266 |
+ $self->add_code(<<___); |
267 |
+.align 4 |
268 |
+$label->{"enter"}: |
269 |
+ mulld $bpi,$tp[0],$n0 |
270 |
+ |
271 |
+ ld $npj,0($np) |
272 |
+___ |
273 |
+ |
274 |
+ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); |
275 |
+ |
276 |
+ for (my $j = 1; $j < $n; $j++) { |
277 |
+ $self->add_code(<<___); |
278 |
+ ld $npj,`$j*$SIZE_T`($np) |
279 |
+___ |
280 |
+ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); |
281 |
+ } |
282 |
+ |
283 |
+ $self->add_code(<<___); |
284 |
+ addc $tp[$n-1],$tp[$n],$c0 |
285 |
+ addze $tp[$n],$tp[$n+1] |
286 |
+ |
287 |
+ addi $i,$i,$SIZE_T |
288 |
+ bdnz $label->{"outer"} |
289 |
+ |
290 |
+ and. $tp[$n],$tp[$n],$tp[$n] |
291 |
+ bne $label->{"sub"} |
292 |
+ |
293 |
+ cmpld $tp[$n-1],$npj |
294 |
+ blt $label->{"copy"} |
295 |
+ |
296 |
+$label->{"sub"}: |
297 |
+___ |
298 |
+ |
299 |
+ # |
300 |
+ # Reduction |
301 |
+ # |
302 |
+ |
303 |
+ $self->add_code(<<___); |
304 |
+ ld $bpj,`0*$SIZE_T`($np) |
305 |
+ subfc $c1,$bpj,$tp[0] |
306 |
+ std $c1,`0*$SIZE_T`($rp) |
307 |
+ |
308 |
+___ |
309 |
+ for (my $j = 1; $j < $n - 1; $j++) { |
310 |
+ $self->add_code(<<___); |
311 |
+ ld $bpj,`$j*$SIZE_T`($np) |
312 |
+ subfe $c1,$bpj,$tp[$j] |
313 |
+ std $c1,`$j*$SIZE_T`($rp) |
314 |
+ |
315 |
+___ |
316 |
+ } |
317 |
+ |
318 |
+ $self->add_code(<<___); |
319 |
+ subfe $c1,$npj,$tp[$n-1] |
320 |
+ std $c1,`($n-1)*$SIZE_T`($rp) |
321 |
+ |
322 |
+___ |
323 |
+ |
324 |
+ $self->add_code(<<___); |
325 |
+ addme. $tp[$n],$tp[$n] |
326 |
+ beq $label->{"end"} |
327 |
+ |
328 |
+$label->{"copy"}: |
329 |
+___ |
330 |
+ |
331 |
+ $self->copy_result(); |
332 |
+ |
333 |
+ $self->add_code(<<___); |
334 |
+ |
335 |
+$label->{"end"}: |
336 |
+___ |
337 |
+ |
338 |
+ $self->restore_registers(); |
339 |
+ |
340 |
+ $self->add_code(<<___); |
341 |
+ li r3,1 |
342 |
+ blr |
343 |
+.size .${fname},.-.${fname} |
344 |
+___ |
345 |
+ |
346 |
+} |
347 |
+ |
348 |
+package Mont::GPR; |
349 |
+ |
350 |
+our @ISA = ('Mont'); |
351 |
+ |
352 |
+sub new($$) |
353 |
+{ |
354 |
+ my ($class, $n) = @_; |
355 |
+ |
356 |
+ return $class->SUPER::new($n); |
357 |
+} |
358 |
+ |
359 |
+sub save_registers($) |
360 |
+{ |
361 |
+ my ($self) = @_; |
362 |
+ |
363 |
+ my $n = $self->{n}; |
364 |
+ |
365 |
+ $self->add_code(<<___); |
366 |
+ std $lo,-8($sp) |
367 |
+___ |
368 |
+ |
369 |
+ for (my $j = 0; $j <= $n+1; $j++) { |
370 |
+ $self->{code}.=<<___; |
371 |
+ std $tp[$j],-`($j+2)*8`($sp) |
372 |
+___ |
373 |
+ } |
374 |
+ |
375 |
+ $self->add_code(<<___); |
376 |
+ |
377 |
+___ |
378 |
+} |
379 |
+ |
380 |
+sub restore_registers($) |
381 |
+{ |
382 |
+ my ($self) = @_; |
383 |
+ |
384 |
+ my $n = $self->{n}; |
385 |
+ |
386 |
+ $self->add_code(<<___); |
387 |
+ ld $lo,-8($sp) |
388 |
+___ |
389 |
+ |
390 |
+ for (my $j = 0; $j <= $n+1; $j++) { |
391 |
+ $self->{code}.=<<___; |
392 |
+ ld $tp[$j],-`($j+2)*8`($sp) |
393 |
+___ |
394 |
+ } |
395 |
+ |
396 |
+ $self->{code} .=<<___; |
397 |
+ |
398 |
+___ |
399 |
+} |
400 |
+ |
401 |
+# Direct translation of C mul() |
402 |
+sub mul($$$$$) |
403 |
+{ |
404 |
+ my ($self, $r, $a, $w, $c) = @_; |
405 |
+ |
406 |
+ $self->add_code(<<___); |
407 |
+ mulld $lo,$a,$w |
408 |
+ addc $r,$lo,$c |
409 |
+ mulhdu $c,$a,$w |
410 |
+ addze $c,$c |
411 |
+ |
412 |
+___ |
413 |
+} |
414 |
+ |
415 |
+# Like mul() but $c is ignored as an input - an optimisation to save a |
416 |
+# preliminary instruction that would set input $c to 0 |
417 |
+sub mul_c_0($$$$$) |
418 |
+{ |
419 |
+ my ($self, $r, $a, $w, $c) = @_; |
420 |
+ |
421 |
+ $self->add_code(<<___); |
422 |
+ mulld $r,$a,$w |
423 |
+ mulhdu $c,$a,$w |
424 |
+ |
425 |
+___ |
426 |
+} |
427 |
+ |
428 |
+# Like mul() but does not do the final addition of CA into $c - an |
429 |
+# optimisation to save an instruction |
430 |
+sub mul_last($$$$$$) |
431 |
+{ |
432 |
+ my ($self, $r1, $r2, $a, $w, $c) = @_; |
433 |
+ |
434 |
+ $self->add_code(<<___); |
435 |
+ mulld $lo,$a,$w |
436 |
+ addc $r1,$lo,$c |
437 |
+ mulhdu $c,$a,$w |
438 |
+ |
439 |
+ addze $r2,$c |
440 |
+___ |
441 |
+} |
442 |
+ |
443 |
+# Like C mul_add() but allow $r_out and $r_in to be different |
444 |
+sub mul_add($$$$$$) |
445 |
+{ |
446 |
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_; |
447 |
+ |
448 |
+ $self->add_code(<<___); |
449 |
+ mulld $lo,$a,$w |
450 |
+ addc $lo,$lo,$c |
451 |
+ mulhdu $c,$a,$w |
452 |
+ addze $c,$c |
453 |
+ addc $r_out,$r_in,$lo |
454 |
+ addze $c,$c |
455 |
+ |
456 |
+___ |
457 |
+} |
458 |
+ |
459 |
+# Like mul_add() but $c is ignored as an input - an optimisation to save a |
460 |
+# preliminary instruction that would set input $c to 0 |
461 |
+sub mul_add_c_0($$$$$$) |
462 |
+{ |
463 |
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_; |
464 |
+ |
465 |
+ $self->add_code(<<___); |
466 |
+ mulld $lo,$a,$w |
467 |
+ addc $r_out,$r_in,$lo |
468 |
+ mulhdu $c,$a,$w |
469 |
+ addze $c,$c |
470 |
+ |
471 |
+___ |
472 |
+} |
473 |
+ |
474 |
+package Mont::GPR_300; |
475 |
+ |
476 |
+our @ISA = ('Mont::GPR'); |
477 |
+ |
478 |
+sub new($$) |
479 |
+{ |
480 |
+ my ($class, $n) = @_; |
481 |
+ |
482 |
+ my $mont = $class->SUPER::new($n); |
483 |
+ |
484 |
+ return $mont; |
485 |
+} |
486 |
+ |
487 |
+sub get_function_name($) |
488 |
+{ |
489 |
+ my ($self) = @_; |
490 |
+ |
491 |
+ return "bn_mul_mont_300_fixed_n" . $self->{n}; |
492 |
+} |
493 |
+ |
494 |
+sub get_label($$) |
495 |
+{ |
496 |
+ my ($self, $l) = @_; |
497 |
+ |
498 |
+ return "L" . $l . "_300_" . $self->{n}; |
499 |
+} |
500 |
+ |
501 |
+# Direct translation of C mul() |
502 |
+sub mul($$$$$) |
503 |
+{ |
504 |
+ my ($self, $r, $a, $w, $c, $last) = @_; |
505 |
+ |
506 |
+ $self->add_code(<<___); |
507 |
+ maddld $r,$a,$w,$c |
508 |
+ maddhdu $c,$a,$w,$c |
509 |
+ |
510 |
+___ |
511 |
+} |
512 |
+ |
513 |
+# Save the last carry as the final entry |
514 |
+sub mul_last($$$$$) |
515 |
+{ |
516 |
+ my ($self, $r1, $r2, $a, $w, $c) = @_; |
517 |
+ |
518 |
+ $self->add_code(<<___); |
519 |
+ maddld $r1,$a,$w,$c |
520 |
+ maddhdu $r2,$a,$w,$c |
521 |
+ |
522 |
+___ |
523 |
+} |
524 |
+ |
525 |
+# Like mul() but $c is ignored as an input - an optimisation to save a |
526 |
+# preliminary instruction that would set input $c to 0 |
527 |
+sub mul_c_0($$$$$) |
528 |
+{ |
529 |
+ my ($self, $r, $a, $w, $c) = @_; |
530 |
+ |
531 |
+ $self->add_code(<<___); |
532 |
+ mulld $r,$a,$w |
533 |
+ mulhdu $c,$a,$w |
534 |
+ |
535 |
+___ |
536 |
+} |
537 |
+ |
538 |
+# Like C mul_add() but allow $r_out and $r_in to be different |
539 |
+sub mul_add($$$$$$) |
540 |
+{ |
541 |
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_; |
542 |
+ |
543 |
+ $self->add_code(<<___); |
544 |
+ maddld $lo,$a,$w,$c |
545 |
+ maddhdu $c,$a,$w,$c |
546 |
+ addc $r_out,$r_in,$lo |
547 |
+ addze $c,$c |
548 |
+ |
549 |
+___ |
550 |
+} |
551 |
+ |
552 |
+# Like mul_add() but $c is ignored as an input - an optimisation to save a |
553 |
+# preliminary instruction that would set input $c to 0 |
554 |
+sub mul_add_c_0($$$$$$) |
555 |
+{ |
556 |
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_; |
557 |
+ |
558 |
+ $self->add_code(<<___); |
559 |
+ maddld $lo,$a,$w,$r_in |
560 |
+ maddhdu $c,$a,$w,$r_in |
561 |
+___ |
562 |
+ |
563 |
+ if ($r_out ne $lo) { |
564 |
+ $self->add_code(<<___); |
565 |
+ mr $r_out,$lo |
566 |
+___ |
567 |
+ } |
568 |
+ |
569 |
+ $self->nl(); |
570 |
+} |
571 |
+ |
572 |
+ |
573 |
+package main; |
574 |
+ |
575 |
+my $code; |
576 |
+ |
577 |
+$code.=<<___; |
578 |
+.machine "any" |
579 |
+.text |
580 |
+___ |
581 |
+ |
582 |
+my $mont; |
583 |
+ |
584 |
+$mont = new Mont::GPR(6); |
585 |
+$mont->mul_mont_fixed(); |
586 |
+$code .= $mont->get_code(); |
587 |
+ |
588 |
+$mont = new Mont::GPR_300(6); |
589 |
+$mont->mul_mont_fixed(); |
590 |
+$code .= $mont->get_code(); |
591 |
+ |
592 |
+$code =~ s/\`([^\`]*)\`/eval $1/gem; |
593 |
+ |
594 |
+$code.=<<___; |
595 |
+.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>" |
596 |
+___ |
597 |
+ |
598 |
+print $code; |
599 |
+close STDOUT or die "error closing STDOUT: $!"; |
600 |
diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c |
601 |
index 3ee76ea96574..1e9421bee213 100644 |
602 |
--- a/crypto/bn/bn_ppc.c |
603 |
+++ b/crypto/bn/bn_ppc.c |
604 |
@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
605 |
const BN_ULONG *np, const BN_ULONG *n0, int num); |
606 |
int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
607 |
const BN_ULONG *np, const BN_ULONG *n0, int num); |
608 |
+ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, |
609 |
+ const BN_ULONG *bp, const BN_ULONG *np, |
610 |
+ const BN_ULONG *n0, int num); |
611 |
+ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, |
612 |
+ const BN_ULONG *bp, const BN_ULONG *np, |
613 |
+ const BN_ULONG *n0, int num); |
614 |
|
615 |
if (num < 4) |
616 |
return 0; |
617 |
@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
618 |
* no opportunity to figure it out... |
619 |
*/ |
620 |
|
621 |
+#if defined(_ARCH_PPC64) && !defined(__ILP32__) |
622 |
+ if (num == 6) { |
623 |
+ if (OPENSSL_ppccap_P & PPC_MADD300) |
624 |
+ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); |
625 |
+ else |
626 |
+ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); |
627 |
+ } |
628 |
+#endif |
629 |
+ |
630 |
return bn_mul_mont_int(rp, ap, bp, np, n0, num); |
631 |
} |
632 |
diff --git a/crypto/bn/build.info b/crypto/bn/build.info |
633 |
index 4f8d0689b5ea..987a70ae263b 100644 |
634 |
--- a/crypto/bn/build.info |
635 |
+++ b/crypto/bn/build.info |
636 |
@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}] |
637 |
|
638 |
$BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s |
639 |
$BNDEF_ppc32=OPENSSL_BN_ASM_MONT |
640 |
- $BNASM_ppc64=$BNASM_ppc32 |
641 |
+ $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s |
642 |
$BNDEF_ppc64=$BNDEF_ppc32 |
643 |
|
644 |
$BNASM_c64xplus=asm/bn-c64xplus.asm |
645 |
@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl |
646 |
GENERATE[bn-ppc.s]=asm/ppc.pl |
647 |
GENERATE[ppc-mont.s]=asm/ppc-mont.pl |
648 |
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl |
649 |
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl |
650 |
|
651 |
GENERATE[alpha-mont.S]=asm/alpha-mont.pl |
652 |
|
653 |
|
654 |
From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001 |
655 |
From: Rohan McLure <rohanmclure@linux.ibm.com> |
656 |
Date: Thu, 30 Jun 2022 16:21:06 +1000 |
657 |
Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9 |
658 |
|
659 |
In the reference C implementation in bn_asm.c, tp[num + 1] contains the |
660 |
carry bit for accumulations into tp[num]. tp[num + 1] is only ever |
661 |
assigned, never itself incremented. |
662 |
--- |
663 |
crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++-- |
664 |
1 file changed, 4 insertions(+), 2 deletions(-) |
665 |
|
666 |
diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl |
667 |
index 0fb397bc5f12..e27d0ad93d85 100755 |
668 |
--- a/crypto/bn/asm/ppc64-mont-fixed.pl |
669 |
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl |
670 |
@@ -63,6 +63,7 @@ |
671 |
# Registers are global so the code is remotely readable |
672 |
|
673 |
# Parameters for Montgomery multiplication |
674 |
+my $ze = "r0"; |
675 |
my $sp = "r1"; |
676 |
my $toc = "r2"; |
677 |
my $rp = "r3"; |
678 |
@@ -192,6 +193,7 @@ ($) |
679 |
$self->save_registers(); |
680 |
|
681 |
$self->add_code(<<___); |
682 |
+ li $ze,0 |
683 |
ld $n0,0($n0) |
684 |
|
685 |
ld $bp0,0($bp) |
686 |
@@ -242,7 +244,7 @@ ($) |
687 |
|
688 |
$self->add_code(<<___); |
689 |
addc $tp[$n],$tp[$n],$c0 |
690 |
- addze $tp[$n+1],$tp[$n+1] |
691 |
+ addze $tp[$n+1],$ze |
692 |
___ |
693 |
|
694 |
$self->add_code(<<___); |
695 |
@@ -272,7 +274,7 @@ ($) |
696 |
and. $tp[$n],$tp[$n],$tp[$n] |
697 |
bne $label->{"sub"} |
698 |
|
699 |
- cmpld $tp[$n-1],$npj |
700 |
+ cmpld $tp[$n-1],$npj |
701 |
blt $label->{"copy"} |
702 |
|
703 |
$label->{"sub"}: |