openssl3/contribs10/0067-ppc64le-Montgomery-multiply.patch

From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001
From: Rohan McLure <rohanmclure@linux.ibm.com>
Date: Mon, 27 Jun 2022 12:14:55 +1000
Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC
 Montgomery Multiplication""

This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.
---
 crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++
 crypto/bn/bn_ppc.c                |  15 +
 crypto/bn/build.info              |   3 +-
 3 files changed, 598 insertions(+), 1 deletion(-)

diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
index e69de29bb2d1..0fb397bc5f12 100755
--- a/crypto/bn/asm/ppc64-mont-fixed.pl
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
@@ -0,0 +1,581 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
+# the OpenSSL project.
+# ====================================================================
+
+#
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
+#
+
+# 2021
+#
+# Although this is a generic implementation for unrolling Montgomery
+# Multiplication for arbitrary values of n, this is currently only
+# used for n = 6 to improve the performance of ECC p384.
+#
+# Unrolling allows intermediate results to be stored in registers,
+# rather than on the stack, improving performance by ~7% compared to
+# the existing PPC assembly code.
+#
+# The ISA 3.0 implementation uses combination multiply/add
+# instructions (maddld, maddhdu) to improve performance by an
+# additional ~10% on Power 9.
+#
+# Finally, saving non-volatile registers into volatile vector
+# registers instead of onto the stack saves a little more.
+#
+# On a Power 9 machine we see an overall improvement of ~18%.
+#
+
+use strict;
+use warnings;
+
+my ($flavour, $output, $dir, $xlate);
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+if ($flavour !~ /64/) {
+       die "bad flavour ($flavour) - only ppc64 permitted";
+}
+
+my $SIZE_T= 8;
+
+# Registers are global so the code is remotely readable
+
+# Parameters for Montgomery multiplication
+my $sp = "r1";
+my $toc        = "r2";
+my $rp = "r3";
+my $ap = "r4";
+my $bp = "r5";
+my $np = "r6";
+my $n0 = "r7";
+my $num        = "r8";
+
+my $i  = "r9";
+my $c0 = "r10";
+my $bp0        = "r11";
+my $bpi        = "r11";
+my $bpj        = "r11";
+my $tj = "r12";
+my $apj        = "r12";
+my $npj        = "r12";
+my $lo = "r14";
+my $c1 = "r14";
+
+# Non-volatile registers used for tp[i]
+#
+# 12 registers are available but the limit on unrolling is 10,
+# since registers from $tp[0] to $tp[$n+1] are used.
+my @tp = ("r20" .. "r31");
+
+# volatile VSRs for saving non-volatile GPRs - faster than stack
+my @vsrs = ("v32" .. "v46");
+
+package Mont;
+
+sub new($$)
+{
+       my ($class, $n) = @_;
+
+       if ($n > 10) {
+               die "Can't unroll for BN length ${n} (maximum 10)"
+       }
+
+       my $self = {
+               code => "",
+               n => $n,
+       };
+       bless $self, $class;
+
+       return $self;
+}
+
+sub add_code($$)
+{
+       my ($self, $c) = @_;
+
+       $self->{code} .= $c;
+}
+
+sub get_code($)
+{
+       my ($self) = @_;
+
+       return $self->{code};
+}
+
+sub get_function_name($)
+{
+       my ($self) = @_;
+
+       return "bn_mul_mont_fixed_n" . $self->{n};
+}
+
+sub get_label($$)
+{
+       my ($self, $l) = @_;
+
+       return "L" . $l . "_" . $self->{n};
+}
+
+sub get_labels($@)
+{
+       my ($self, @labels) = @_;
+
+       my %out = ();
+
+       foreach my $l (@labels) {
+               $out{"$l"} = $self->get_label("$l");
+       }
+
+       return \%out;
+}
+
+sub nl($)
+{
+       my ($self) = @_;
+
+       $self->add_code("\n");
+}
+
+sub copy_result($)
+{
+       my ($self) = @_;
+
+       my ($n) = $self->{n};
+
+       for (my $j = 0; $j < $n; $j++) {
+               $self->add_code(<<___);
+       std             $tp[$j],`$j*$SIZE_T`($rp)
+___
+       }
+
+}
+
+sub mul_mont_fixed($)
+{
+       my ($self) = @_;
+
+       my ($n) = $self->{n};
+       my $fname = $self->get_function_name();
+       my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
+
+       $self->add_code(<<___);
+
+.globl .${fname}
+.align 5
+.${fname}:
+
+___
+
+       $self->save_registers();
+
+       $self->add_code(<<___);
+       ld              $n0,0($n0)
+
+       ld              $bp0,0($bp)
+
+       ld              $apj,0($ap)
+___
+
+       $self->mul_c_0($tp[0], $apj, $bp0, $c0);
+
+       for (my $j = 1; $j < $n - 1; $j++) {
+               $self->add_code(<<___);
+       ld              $apj,`$j*$SIZE_T`($ap)
+___
+               $self->mul($tp[$j], $apj, $bp0, $c0);
+       }
+
+       $self->add_code(<<___);
+       ld              $apj,`($n-1)*$SIZE_T`($ap)
+___
+
+       $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
+
+       $self->add_code(<<___);
+       li              $tp[$n+1],0
+
+___
+
+       $self->add_code(<<___);
+       li              $i,0
+       mtctr           $num
+       b               $label->{"enter"}
+
+.align 4
+$label->{"outer"}:
+       ldx             $bpi,$bp,$i
+
+       ld              $apj,0($ap)
+___
+
+       $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
+
+       for (my $j = 1; $j < $n; $j++) {
+               $self->add_code(<<___);
+       ld              $apj,`$j*$SIZE_T`($ap)
+___
+               $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
+       }
+
+       $self->add_code(<<___);
+       addc            $tp[$n],$tp[$n],$c0
+       addze           $tp[$n+1],$tp[$n+1]
+___
+
+       $self->add_code(<<___);
+.align 4
+$label->{"enter"}:
+       mulld           $bpi,$tp[0],$n0
+
+       ld              $npj,0($np)
+___
+
+       $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
+
+       for (my $j = 1; $j < $n; $j++) {
+               $self->add_code(<<___);
+       ld              $npj,`$j*$SIZE_T`($np)
+___
+               $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
+       }
+
+       $self->add_code(<<___);
+       addc            $tp[$n-1],$tp[$n],$c0
+       addze           $tp[$n],$tp[$n+1]
+
+       addi            $i,$i,$SIZE_T
+       bdnz            $label->{"outer"}
+
+       and.            $tp[$n],$tp[$n],$tp[$n]
+       bne             $label->{"sub"}
+
+       cmpld   $tp[$n-1],$npj
+       blt             $label->{"copy"}
+
+$label->{"sub"}:
+___
+
+       #
+       # Reduction
+       #
+
+               $self->add_code(<<___);
+       ld              $bpj,`0*$SIZE_T`($np)
+       subfc           $c1,$bpj,$tp[0]
+       std             $c1,`0*$SIZE_T`($rp)
+
+___
+       for (my $j = 1; $j < $n - 1; $j++) {
+               $self->add_code(<<___);
+       ld              $bpj,`$j*$SIZE_T`($np)
+       subfe           $c1,$bpj,$tp[$j]
+       std             $c1,`$j*$SIZE_T`($rp)
+
+___
+       }
+
+               $self->add_code(<<___);
+       subfe           $c1,$npj,$tp[$n-1]
+       std             $c1,`($n-1)*$SIZE_T`($rp)
+
+___
+
+       $self->add_code(<<___);
+       addme.          $tp[$n],$tp[$n]
+       beq             $label->{"end"}
+
+$label->{"copy"}:
+___
+
+       $self->copy_result();
+
+       $self->add_code(<<___);
+
+$label->{"end"}:
+___
+
+       $self->restore_registers();
+
+       $self->add_code(<<___);
+       li              r3,1
+       blr
+.size .${fname},.-.${fname}
+___
+
+}
+
+package Mont::GPR;
+
+our @ISA = ('Mont');
+
+sub new($$)
+{
+    my ($class, $n) = @_;
+
+    return $class->SUPER::new($n);
+}
+
+sub save_registers($)
+{
+       my ($self) = @_;
+
+       my $n = $self->{n};
+
+       $self->add_code(<<___);
+       std     $lo,-8($sp)
+___
+
+       for (my $j = 0; $j <= $n+1; $j++) {
+               $self->{code}.=<<___;
+       std     $tp[$j],-`($j+2)*8`($sp)
+___
+       }
+
+       $self->add_code(<<___);
+
+___
+}
+
+sub restore_registers($)
+{
+       my ($self) = @_;
+
+       my $n = $self->{n};
+
+       $self->add_code(<<___);
+       ld      $lo,-8($sp)
+___
+
+       for (my $j = 0; $j <= $n+1; $j++) {
+               $self->{code}.=<<___;
+       ld      $tp[$j],-`($j+2)*8`($sp)
+___
+       }
+
+       $self->{code} .=<<___;
+
+___
+}
+
+# Direct translation of C mul()
+sub mul($$$$$)
+{
+       my ($self, $r, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld           $lo,$a,$w
+       addc            $r,$lo,$c
+       mulhdu          $c,$a,$w
+       addze           $c,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+       my ($self, $r, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld           $r,$a,$w
+       mulhdu          $c,$a,$w
+
+___
+}
+
+# Like mul() but does not do the final addition of CA into $c - an
+# optimisation to save an instruction
+sub mul_last($$$$$$)
+{
+       my ($self, $r1, $r2, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld           $lo,$a,$w
+       addc            $r1,$lo,$c
+       mulhdu          $c,$a,$w
+
+       addze           $r2,$c
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+       my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld           $lo,$a,$w
+       addc            $lo,$lo,$c
+       mulhdu          $c,$a,$w
+       addze           $c,$c
+       addc            $r_out,$r_in,$lo
+       addze           $c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+       my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld           $lo,$a,$w
+       addc            $r_out,$r_in,$lo
+       mulhdu          $c,$a,$w
+       addze           $c,$c
+
+___
+}
+
+package Mont::GPR_300;
+
+our @ISA = ('Mont::GPR');
+
+sub new($$)
+{
+       my ($class, $n) = @_;
+
+       my $mont = $class->SUPER::new($n);
+
+       return $mont;
+}
+
+sub get_function_name($)
+{
+       my ($self) = @_;
+
+       return "bn_mul_mont_300_fixed_n" . $self->{n};
+}
+
+sub get_label($$)
+{
+       my ($self, $l) = @_;
+
+       return "L" . $l . "_300_" . $self->{n};
+}
+
+# Direct translation of C mul()
+sub mul($$$$$)
+{
+       my ($self, $r, $a, $w, $c, $last) = @_;
+
+       $self->add_code(<<___);
+       maddld          $r,$a,$w,$c
+       maddhdu         $c,$a,$w,$c
+
+___
+}
+
+# Save the last carry as the final entry
+sub mul_last($$$$$)
+{
+       my ($self, $r1, $r2, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       maddld          $r1,$a,$w,$c
+       maddhdu         $r2,$a,$w,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+       my ($self, $r, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       mulld          $r,$a,$w
+       mulhdu          $c,$a,$w
+
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+       my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       maddld          $lo,$a,$w,$c
+       maddhdu         $c,$a,$w,$c
+       addc            $r_out,$r_in,$lo
+       addze           $c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+       my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+       $self->add_code(<<___);
+       maddld          $lo,$a,$w,$r_in
+       maddhdu         $c,$a,$w,$r_in
+___
+
+       if ($r_out ne $lo) {
+               $self->add_code(<<___);
+       mr                      $r_out,$lo
+___
+       }
+
+       $self->nl();
+}
+
+
+package main;
+
+my $code;
+
+$code.=<<___;
+.machine "any"
+.text
+___
+
+my $mont;
+
+$mont = new Mont::GPR(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$mont = new Mont::GPR_300(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+$code.=<<___;
+.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
index 3ee76ea96574..1e9421bee213 100644
--- a/crypto/bn/bn_ppc.c
+++ b/crypto/bn/bn_ppc.c
@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
     int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                           const BN_ULONG *np, const BN_ULONG *n0, int num);
+    int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+                             const BN_ULONG *bp, const BN_ULONG *np,
+                             const BN_ULONG *n0, int num);
+    int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+                                 const BN_ULONG *bp, const BN_ULONG *np,
+                                 const BN_ULONG *n0, int num);
 
     if (num < 4)
         return 0;
@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
      * no opportunity to figure it out...
      */
 
+#if defined(_ARCH_PPC64) && !defined(__ILP32__)
+    if (num == 6) {
+        if (OPENSSL_ppccap_P & PPC_MADD300)
+            return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
+        else
+            return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
+    }
+#endif
+
     return bn_mul_mont_int(rp, ap, bp, np, n0, num);
 }
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
index 4f8d0689b5ea..987a70ae263b 100644
--- a/crypto/bn/build.info
+++ b/crypto/bn/build.info
@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
 
   $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s
   $BNDEF_ppc32=OPENSSL_BN_ASM_MONT
-  $BNASM_ppc64=$BNASM_ppc32
+  $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
   $BNDEF_ppc64=$BNDEF_ppc32
 
   $BNASM_c64xplus=asm/bn-c64xplus.asm
@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
 GENERATE[bn-ppc.s]=asm/ppc.pl
 GENERATE[ppc-mont.s]=asm/ppc-mont.pl
 GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
 
 GENERATE[alpha-mont.S]=asm/alpha-mont.pl
 

From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001
From: Rohan McLure <rohanmclure@linux.ibm.com>
Date: Thu, 30 Jun 2022 16:21:06 +1000
Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9

In the reference C implementation in bn_asm.c, tp[num + 1] contains the
carry bit for accumulations into tp[num]. tp[num + 1] is only ever
assigned, never itself incremented.
---
 crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
index 0fb397bc5f12..e27d0ad93d85 100755
--- a/crypto/bn/asm/ppc64-mont-fixed.pl
+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
@@ -63,6 +63,7 @@
 # Registers are global so the code is remotely readable
 
 # Parameters for Montgomery multiplication
+my $ze = "r0";
 my $sp = "r1";
 my $toc        = "r2";
 my $rp = "r3";
@@ -192,6 +193,7 @@ ($)
        $self->save_registers();
 
        $self->add_code(<<___);
+       li              $ze,0
        ld              $n0,0($n0)
 
        ld              $bp0,0($bp)
@@ -242,7 +244,7 @@ ($)
 
        $self->add_code(<<___);
        addc            $tp[$n],$tp[$n],$c0
-       addze           $tp[$n+1],$tp[$n+1]
+       addze           $tp[$n+1],$ze
 ___
 
        $self->add_code(<<___);
@@ -272,7 +274,7 @@ ($)
        and.            $tp[$n],$tp[$n],$tp[$n]
        bne             $label->{"sub"}
 
-       cmpld   $tp[$n-1],$npj
+       cmpld           $tp[$n-1],$npj
        blt             $label->{"copy"}
 
 $label->{"sub"}:
1	From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001
2	From: Rohan McLure <rohanmclure@linux.ibm.com>
3	Date: Mon, 27 Jun 2022 12:14:55 +1000
4	Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC
5	Montgomery Multiplication""
6
7	This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e.
8	---
9	crypto/bn/asm/ppc64-mont-fixed.pl \| 581 ++++++++++++++++++++++++++++++
10	crypto/bn/bn_ppc.c \| 15 +
11	crypto/bn/build.info \| 3 +-
12	3 files changed, 598 insertions(+), 1 deletion(-)
13
14	diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
15	index e69de29bb2d1..0fb397bc5f12 100755
16	--- a/crypto/bn/asm/ppc64-mont-fixed.pl
17	+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
18	@@ -0,0 +1,581 @@
19	+#! /usr/bin/env perl
20	+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
21	+#
22	+# Licensed under the Apache License 2.0 (the "License"). You may not use
23	+# this file except in compliance with the License. You can obtain a copy
24	+# in the file LICENSE in the source distribution or at
25	+# https://www.openssl.org/source/license.html
26	+
27	+# ====================================================================
28	+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
29	+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
30	+# the OpenSSL project.
31	+# ====================================================================
32	+
33	+#
34	+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
35	+#
36	+
37	+# 2021
38	+#
39	+# Although this is a generic implementation for unrolling Montgomery
40	+# Multiplication for arbitrary values of n, this is currently only
41	+# used for n = 6 to improve the performance of ECC p384.
42	+#
43	+# Unrolling allows intermediate results to be stored in registers,
44	+# rather than on the stack, improving performance by ~7% compared to
45	+# the existing PPC assembly code.
46	+#
47	+# The ISA 3.0 implementation uses combination multiply/add
48	+# instructions (maddld, maddhdu) to improve performance by an
49	+# additional ~10% on Power 9.
50	+#
51	+# Finally, saving non-volatile registers into volatile vector
52	+# registers instead of onto the stack saves a little more.
53	+#
54	+# On a Power 9 machine we see an overall improvement of ~18%.
55	+#
56	+
57	+use strict;
58	+use warnings;
59	+
60	+my ($flavour, $output, $dir, $xlate);
61	+
62	+# $output is the last argument if it looks like a file (it has an extension)
63	+# $flavour is the first argument if it doesn't look like a file
64	+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m\|\.\w+$\| ? pop : undef;
65	+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m\|\.\| ? shift : undef;
66	+
67	+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68	+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
69	+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
70	+die "can't locate ppc-xlate.pl";
71	+
72	+open STDOUT,"\| $^X $xlate $flavour \"$output\""
73	+ or die "can't call $xlate: $!";
74	+
75	+if ($flavour !~ /64/) {
76	+ die "bad flavour ($flavour) - only ppc64 permitted";
77	+}
78	+
79	+my $SIZE_T= 8;
80	+
81	+# Registers are global so the code is remotely readable
82	+
83	+# Parameters for Montgomery multiplication
84	+my $sp = "r1";
85	+my $toc = "r2";
86	+my $rp = "r3";
87	+my $ap = "r4";
88	+my $bp = "r5";
89	+my $np = "r6";
90	+my $n0 = "r7";
91	+my $num = "r8";
92	+
93	+my $i = "r9";
94	+my $c0 = "r10";
95	+my $bp0 = "r11";
96	+my $bpi = "r11";
97	+my $bpj = "r11";
98	+my $tj = "r12";
99	+my $apj = "r12";
100	+my $npj = "r12";
101	+my $lo = "r14";
102	+my $c1 = "r14";
103	+
104	+# Non-volatile registers used for tp[i]
105	+#
106	+# 12 registers are available but the limit on unrolling is 10,
107	+# since registers from $tp[0] to $tp[$n+1] are used.
108	+my @tp = ("r20" .. "r31");
109	+
110	+# volatile VSRs for saving non-volatile GPRs - faster than stack
111	+my @vsrs = ("v32" .. "v46");
112	+
113	+package Mont;
114	+
115	+sub new($$)
116	+{
117	+ my ($class, $n) = @_;
118	+
119	+ if ($n > 10) {
120	+ die "Can't unroll for BN length ${n} (maximum 10)"
121	+ }
122	+
123	+ my $self = {
124	+ code => "",
125	+ n => $n,
126	+ };
127	+ bless $self, $class;
128	+
129	+ return $self;
130	+}
131	+
132	+sub add_code($$)
133	+{
134	+ my ($self, $c) = @_;
135	+
136	+ $self->{code} .= $c;
137	+}
138	+
139	+sub get_code($)
140	+{
141	+ my ($self) = @_;
142	+
143	+ return $self->{code};
144	+}
145	+
146	+sub get_function_name($)
147	+{
148	+ my ($self) = @_;
149	+
150	+ return "bn_mul_mont_fixed_n" . $self->{n};
151	+}
152	+
153	+sub get_label($$)
154	+{
155	+ my ($self, $l) = @_;
156	+
157	+ return "L" . $l . "_" . $self->{n};
158	+}
159	+
160	+sub get_labels($@)
161	+{
162	+ my ($self, @labels) = @_;
163	+
164	+ my %out = ();
165	+
166	+ foreach my $l (@labels) {
167	+ $out{"$l"} = $self->get_label("$l");
168	+ }
169	+
170	+ return \%out;
171	+}
172	+
173	+sub nl($)
174	+{
175	+ my ($self) = @_;
176	+
177	+ $self->add_code("\n");
178	+}
179	+
180	+sub copy_result($)
181	+{
182	+ my ($self) = @_;
183	+
184	+ my ($n) = $self->{n};
185	+
186	+ for (my $j = 0; $j < $n; $j++) {
187	+ $self->add_code(<<___);
188	+ std $tp[$j],`$j*$SIZE_T`($rp)
189	+___
190	+ }
191	+
192	+}
193	+
194	+sub mul_mont_fixed($)
195	+{
196	+ my ($self) = @_;
197	+
198	+ my ($n) = $self->{n};
199	+ my $fname = $self->get_function_name();
200	+ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
201	+
202	+ $self->add_code(<<___);
203	+
204	+.globl .${fname}
205	+.align 5
206	+.${fname}:
207	+
208	+___
209	+
210	+ $self->save_registers();
211	+
212	+ $self->add_code(<<___);
213	+ ld $n0,0($n0)
214	+
215	+ ld $bp0,0($bp)
216	+
217	+ ld $apj,0($ap)
218	+___
219	+
220	+ $self->mul_c_0($tp[0], $apj, $bp0, $c0);
221	+
222	+ for (my $j = 1; $j < $n - 1; $j++) {
223	+ $self->add_code(<<___);
224	+ ld $apj,`$j*$SIZE_T`($ap)
225	+___
226	+ $self->mul($tp[$j], $apj, $bp0, $c0);
227	+ }
228	+
229	+ $self->add_code(<<___);
230	+ ld $apj,`($n-1)*$SIZE_T`($ap)
231	+___
232	+
233	+ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
234	+
235	+ $self->add_code(<<___);
236	+ li $tp[$n+1],0
237	+
238	+___
239	+
240	+ $self->add_code(<<___);
241	+ li $i,0
242	+ mtctr $num
243	+ b $label->{"enter"}
244	+
245	+.align 4
246	+$label->{"outer"}:
247	+ ldx $bpi,$bp,$i
248	+
249	+ ld $apj,0($ap)
250	+___
251	+
252	+ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
253	+
254	+ for (my $j = 1; $j < $n; $j++) {
255	+ $self->add_code(<<___);
256	+ ld $apj,`$j*$SIZE_T`($ap)
257	+___
258	+ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
259	+ }
260	+
261	+ $self->add_code(<<___);
262	+ addc $tp[$n],$tp[$n],$c0
263	+ addze $tp[$n+1],$tp[$n+1]
264	+___
265	+
266	+ $self->add_code(<<___);
267	+.align 4
268	+$label->{"enter"}:
269	+ mulld $bpi,$tp[0],$n0
270	+
271	+ ld $npj,0($np)
272	+___
273	+
274	+ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
275	+
276	+ for (my $j = 1; $j < $n; $j++) {
277	+ $self->add_code(<<___);
278	+ ld $npj,`$j*$SIZE_T`($np)
279	+___
280	+ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
281	+ }
282	+
283	+ $self->add_code(<<___);
284	+ addc $tp[$n-1],$tp[$n],$c0
285	+ addze $tp[$n],$tp[$n+1]
286	+
287	+ addi $i,$i,$SIZE_T
288	+ bdnz $label->{"outer"}
289	+
290	+ and. $tp[$n],$tp[$n],$tp[$n]
291	+ bne $label->{"sub"}
292	+
293	+ cmpld $tp[$n-1],$npj
294	+ blt $label->{"copy"}
295	+
296	+$label->{"sub"}:
297	+___
298	+
299	+ #
300	+ # Reduction
301	+ #
302	+
303	+ $self->add_code(<<___);
304	+ ld $bpj,`0*$SIZE_T`($np)
305	+ subfc $c1,$bpj,$tp[0]
306	+ std $c1,`0*$SIZE_T`($rp)
307	+
308	+___
309	+ for (my $j = 1; $j < $n - 1; $j++) {
310	+ $self->add_code(<<___);
311	+ ld $bpj,`$j*$SIZE_T`($np)
312	+ subfe $c1,$bpj,$tp[$j]
313	+ std $c1,`$j*$SIZE_T`($rp)
314	+
315	+___
316	+ }
317	+
318	+ $self->add_code(<<___);
319	+ subfe $c1,$npj,$tp[$n-1]
320	+ std $c1,`($n-1)*$SIZE_T`($rp)
321	+
322	+___
323	+
324	+ $self->add_code(<<___);
325	+ addme. $tp[$n],$tp[$n]
326	+ beq $label->{"end"}
327	+
328	+$label->{"copy"}:
329	+___
330	+
331	+ $self->copy_result();
332	+
333	+ $self->add_code(<<___);
334	+
335	+$label->{"end"}:
336	+___
337	+
338	+ $self->restore_registers();
339	+
340	+ $self->add_code(<<___);
341	+ li r3,1
342	+ blr
343	+.size .${fname},.-.${fname}
344	+___
345	+
346	+}
347	+
348	+package Mont::GPR;
349	+
350	+our @ISA = ('Mont');
351	+
352	+sub new($$)
353	+{
354	+ my ($class, $n) = @_;
355	+
356	+ return $class->SUPER::new($n);
357	+}
358	+
359	+sub save_registers($)
360	+{
361	+ my ($self) = @_;
362	+
363	+ my $n = $self->{n};
364	+
365	+ $self->add_code(<<___);
366	+ std $lo,-8($sp)
367	+___
368	+
369	+ for (my $j = 0; $j <= $n+1; $j++) {
370	+ $self->{code}.=<<___;
371	+ std $tp[$j],-`($j+2)*8`($sp)
372	+___
373	+ }
374	+
375	+ $self->add_code(<<___);
376	+
377	+___
378	+}
379	+
380	+sub restore_registers($)
381	+{
382	+ my ($self) = @_;
383	+
384	+ my $n = $self->{n};
385	+
386	+ $self->add_code(<<___);
387	+ ld $lo,-8($sp)
388	+___
389	+
390	+ for (my $j = 0; $j <= $n+1; $j++) {
391	+ $self->{code}.=<<___;
392	+ ld $tp[$j],-`($j+2)*8`($sp)
393	+___
394	+ }
395	+
396	+ $self->{code} .=<<___;
397	+
398	+___
399	+}
400	+
401	+# Direct translation of C mul()
402	+sub mul($$$$$)
403	+{
404	+ my ($self, $r, $a, $w, $c) = @_;
405	+
406	+ $self->add_code(<<___);
407	+ mulld $lo,$a,$w
408	+ addc $r,$lo,$c
409	+ mulhdu $c,$a,$w
410	+ addze $c,$c
411	+
412	+___
413	+}
414	+
415	+# Like mul() but $c is ignored as an input - an optimisation to save a
416	+# preliminary instruction that would set input $c to 0
417	+sub mul_c_0($$$$$)
418	+{
419	+ my ($self, $r, $a, $w, $c) = @_;
420	+
421	+ $self->add_code(<<___);
422	+ mulld $r,$a,$w
423	+ mulhdu $c,$a,$w
424	+
425	+___
426	+}
427	+
428	+# Like mul() but does not do the final addition of CA into $c - an
429	+# optimisation to save an instruction
430	+sub mul_last($$$$$$)
431	+{
432	+ my ($self, $r1, $r2, $a, $w, $c) = @_;
433	+
434	+ $self->add_code(<<___);
435	+ mulld $lo,$a,$w
436	+ addc $r1,$lo,$c
437	+ mulhdu $c,$a,$w
438	+
439	+ addze $r2,$c
440	+___
441	+}
442	+
443	+# Like C mul_add() but allow $r_out and $r_in to be different
444	+sub mul_add($$$$$$)
445	+{
446	+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
447	+
448	+ $self->add_code(<<___);
449	+ mulld $lo,$a,$w
450	+ addc $lo,$lo,$c
451	+ mulhdu $c,$a,$w
452	+ addze $c,$c
453	+ addc $r_out,$r_in,$lo
454	+ addze $c,$c
455	+
456	+___
457	+}
458	+
459	+# Like mul_add() but $c is ignored as an input - an optimisation to save a
460	+# preliminary instruction that would set input $c to 0
461	+sub mul_add_c_0($$$$$$)
462	+{
463	+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
464	+
465	+ $self->add_code(<<___);
466	+ mulld $lo,$a,$w
467	+ addc $r_out,$r_in,$lo
468	+ mulhdu $c,$a,$w
469	+ addze $c,$c
470	+
471	+___
472	+}
473	+
474	+package Mont::GPR_300;
475	+
476	+our @ISA = ('Mont::GPR');
477	+
478	+sub new($$)
479	+{
480	+ my ($class, $n) = @_;
481	+
482	+ my $mont = $class->SUPER::new($n);
483	+
484	+ return $mont;
485	+}
486	+
487	+sub get_function_name($)
488	+{
489	+ my ($self) = @_;
490	+
491	+ return "bn_mul_mont_300_fixed_n" . $self->{n};
492	+}
493	+
494	+sub get_label($$)
495	+{
496	+ my ($self, $l) = @_;
497	+
498	+ return "L" . $l . "_300_" . $self->{n};
499	+}
500	+
501	+# Direct translation of C mul()
502	+sub mul($$$$$)
503	+{
504	+ my ($self, $r, $a, $w, $c, $last) = @_;
505	+
506	+ $self->add_code(<<___);
507	+ maddld $r,$a,$w,$c
508	+ maddhdu $c,$a,$w,$c
509	+
510	+___
511	+}
512	+
513	+# Save the last carry as the final entry
514	+sub mul_last($$$$$)
515	+{
516	+ my ($self, $r1, $r2, $a, $w, $c) = @_;
517	+
518	+ $self->add_code(<<___);
519	+ maddld $r1,$a,$w,$c
520	+ maddhdu $r2,$a,$w,$c
521	+
522	+___
523	+}
524	+
525	+# Like mul() but $c is ignored as an input - an optimisation to save a
526	+# preliminary instruction that would set input $c to 0
527	+sub mul_c_0($$$$$)
528	+{
529	+ my ($self, $r, $a, $w, $c) = @_;
530	+
531	+ $self->add_code(<<___);
532	+ mulld $r,$a,$w
533	+ mulhdu $c,$a,$w
534	+
535	+___
536	+}
537	+
538	+# Like C mul_add() but allow $r_out and $r_in to be different
539	+sub mul_add($$$$$$)
540	+{
541	+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
542	+
543	+ $self->add_code(<<___);
544	+ maddld $lo,$a,$w,$c
545	+ maddhdu $c,$a,$w,$c
546	+ addc $r_out,$r_in,$lo
547	+ addze $c,$c
548	+
549	+___
550	+}
551	+
552	+# Like mul_add() but $c is ignored as an input - an optimisation to save a
553	+# preliminary instruction that would set input $c to 0
554	+sub mul_add_c_0($$$$$$)
555	+{
556	+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
557	+
558	+ $self->add_code(<<___);
559	+ maddld $lo,$a,$w,$r_in
560	+ maddhdu $c,$a,$w,$r_in
561	+___
562	+
563	+ if ($r_out ne $lo) {
564	+ $self->add_code(<<___);
565	+ mr $r_out,$lo
566	+___
567	+ }
568	+
569	+ $self->nl();
570	+}
571	+
572	+
573	+package main;
574	+
575	+my $code;
576	+
577	+$code.=<<___;
578	+.machine "any"
579	+.text
580	+___
581	+
582	+my $mont;
583	+
584	+$mont = new Mont::GPR(6);
585	+$mont->mul_mont_fixed();
586	+$code .= $mont->get_code();
587	+
588	+$mont = new Mont::GPR_300(6);
589	+$mont->mul_mont_fixed();
590	+$code .= $mont->get_code();
591	+
592	+$code =~ s/\`([^\`]*)\`/eval $1/gem;
593	+
594	+$code.=<<___;
595	+.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
596	+___
597	+
598	+print $code;
599	+close STDOUT or die "error closing STDOUT: $!";
600	diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
601	index 3ee76ea96574..1e9421bee213 100644
602	--- a/crypto/bn/bn_ppc.c
603	+++ b/crypto/bn/bn_ppc.c
604	@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
605	const BN_ULONG np, const BN_ULONG n0, int num);
606	int bn_mul4x_mont_int(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
607	const BN_ULONG np, const BN_ULONG n0, int num);
608	+ int bn_mul_mont_fixed_n6(BN_ULONG rp, const BN_ULONG ap,
609	+ const BN_ULONG bp, const BN_ULONG np,
610	+ const BN_ULONG *n0, int num);
611	+ int bn_mul_mont_300_fixed_n6(BN_ULONG rp, const BN_ULONG ap,
612	+ const BN_ULONG bp, const BN_ULONG np,
613	+ const BN_ULONG *n0, int num);
614
615	if (num < 4)
616	return 0;
617	@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
618	* no opportunity to figure it out...
619	*/
620
621	+#if defined(_ARCH_PPC64) && !defined(__ILP32__)
622	+ if (num == 6) {
623	+ if (OPENSSL_ppccap_P & PPC_MADD300)
624	+ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
625	+ else
626	+ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
627	+ }
628	+#endif
629	+
630	return bn_mul_mont_int(rp, ap, bp, np, n0, num);
631	}
632	diff --git a/crypto/bn/build.info b/crypto/bn/build.info
633	index 4f8d0689b5ea..987a70ae263b 100644
634	--- a/crypto/bn/build.info
635	+++ b/crypto/bn/build.info
636	@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
637
638	$BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s
639	$BNDEF_ppc32=OPENSSL_BN_ASM_MONT
640	- $BNASM_ppc64=$BNASM_ppc32
641	+ $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
642	$BNDEF_ppc64=$BNDEF_ppc32
643
644	$BNASM_c64xplus=asm/bn-c64xplus.asm
645	@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
646	GENERATE[bn-ppc.s]=asm/ppc.pl
647	GENERATE[ppc-mont.s]=asm/ppc-mont.pl
648	GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
649	+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
650
651	GENERATE[alpha-mont.S]=asm/alpha-mont.pl
652
653
654	From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001
655	From: Rohan McLure <rohanmclure@linux.ibm.com>
656	Date: Thu, 30 Jun 2022 16:21:06 +1000
657	Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9
658
659	In the reference C implementation in bn_asm.c, tp[num + 1] contains the
660	carry bit for accumulations into tp[num]. tp[num + 1] is only ever
661	assigned, never itself incremented.
662	---
663	crypto/bn/asm/ppc64-mont-fixed.pl \| 6 ++++--
664	1 file changed, 4 insertions(+), 2 deletions(-)
665
666	diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
667	index 0fb397bc5f12..e27d0ad93d85 100755
668	--- a/crypto/bn/asm/ppc64-mont-fixed.pl
669	+++ b/crypto/bn/asm/ppc64-mont-fixed.pl
670	@@ -63,6 +63,7 @@
671	# Registers are global so the code is remotely readable
672
673	# Parameters for Montgomery multiplication
674	+my $ze = "r0";
675	my $sp = "r1";
676	my $toc = "r2";
677	my $rp = "r3";
678	@@ -192,6 +193,7 @@ ($)
679	$self->save_registers();
680
681	$self->add_code(<<___);
682	+ li $ze,0
683	ld $n0,0($n0)
684
685	ld $bp0,0($bp)
686	@@ -242,7 +244,7 @@ ($)
687
688	$self->add_code(<<___);
689	addc $tp[$n],$tp[$n],$c0
690	- addze $tp[$n+1],$tp[$n+1]
691	+ addze $tp[$n+1],$ze
692	___
693
694	$self->add_code(<<___);
695	@@ -272,7 +274,7 @@ ($)
696	and. $tp[$n],$tp[$n],$tp[$n]
697	bne $label->{"sub"}
698
699	- cmpld $tp[$n-1],$npj
700	+ cmpld $tp[$n-1],$npj
701	blt $label->{"copy"}
702
703	$label->{"sub"}: