This PR performs further optimization based on the already merged "Implement Montgome...

author fengpengbo <feng.pengbo@zte.com.cn>

Fri, 19 Dec 2025 03:00:59 +0000 (11:00 +0800)

committer Neil Horman <nhorman@openssl.org>

Tue, 30 Dec 2025 13:22:46 +0000 (08:22 -0500)
author fengpengbo <feng.pengbo@zte.com.cn>
Fri, 19 Dec 2025 03:00:59 +0000 (11:00 +0800)
committer Neil Horman <nhorman@openssl.org>
Tue, 30 Dec 2025 13:22:46 +0000 (08:22 -0500)
diff --git a/crypto/bn/asm/riscv64-mont.pl b/crypto/bn/asm/riscv64-mont.pl

index b512dcfd015e90da3c17b1aa13abe2fd509bb77d..af55d0c1c3138b7c0b250ed0b65ea4bfdee37678 100644 (file)
--- a/crypto/bn/asm/riscv64-mont.pl
+++ b/crypto/bn/asm/riscv64-mont.pl
@@ -57,7 +57,7 @@ $output and open STDOUT,">$output";
              $stack_offset -= 8;
              $ret.="    sd      $_,$stack_offset(sp)\n";
          }
-           return $ret;
+        return $ret;
      }
      sub load_regs {
          my $ret = '';
@@ -70,7 +70,7 @@ $output and open STDOUT,">$output";
              $stack_offset -= 8;
              $ret.="    ld      $_,$stack_offset(sp)\n";
          }
-           $ret.="    addi    sp,sp,$stack_reservation\n";
+        $ret.="    addi    sp,sp,$stack_reservation\n";
          return $ret;
      }
      sub clear_regs {
@@ -84,12 +84,12 @@ $output and open STDOUT,">$output";
  
  # Function arguments
  #      RISC-V    ABI
-# $rp  x10          a0  # BN_ULONG *rp
-# $ap  x11          a1  # const BN_ULONG *ap
-# $bp  x12          a2  # const BN_ULONG *bp
-# $np  x13          a3  # const BN_ULONG *np
-# $n0  x14      a4  # const BN_ULONG *n0
-# $num x15      a5  # int num
+# $rp   x10     a0  # BN_ULONG *rp
+# $ap   x11     a1  # const BN_ULONG *ap
+# $bp   x12     a2  # const BN_ULONG *bp
+# $np   x13     a3  # const BN_ULONG *np
+# $n0   x14     a4  # const BN_ULONG *n0
+# $num  x15     a5  # int num
  my ($rp,$ap,$bp,$np,$n0,$num) = use_regs(10,11,12,13,14,15);
  
  # Return address and Frame pointer
@@ -100,24 +100,24 @@ my ($ra,$fp) = use_regs(1,8);
  
  # Temporary variable allocation
  #      RISC-V    ABI
-# $lo0 x5           t0    the sum of partial products of a and b
-# $hi0 x6           t1    the high word of partial product of a and b + Carry
-# $aj  x7           t2    ap[j]
-# $m0  x28          t3    bp[i]
-# $alo x29          t4    the low word of partial product
-# $ahi x30      t5    the high word of partial product
-# $lo1 x31          t6    partial product + reduction term
-# $hi1 x18          s2    the high word of reduction term + Carry
-# $nj  x19          s3    np[j],modulus
-# $m1  x20          s4    montgomery reduction coefficient
-# $nlo x21          s5    the low word of reduction term
-# $nhi x22          s6    the high word of reduction term
-# $ovf x23          s7    highest carry bit,overflow flag
-# $i   x24          s8    outer loop index
-# $j   x25          s9    inner loop index
-# $tp  x26          s10   temporary result storage
-# $tj  x27          s11   tp[j],temporary result value
-# $temp x9       s1
+# $lo0  x5     t0    the sum of partial products of a and b
+# $hi0  x6     t1    the high word of partial product of a and b + Carry
+# $aj   x7     t2    ap[j]
+# $m0   x28    t3    bp[i]
+# $alo  x29    t4    the low word of partial product
+# $ahi  x30    t5    the high word of partial product
+# $lo1  x31    t6    partial product + reduction term
+# $hi1  x18    s2    the high word of reduction term + Carry
+# $nj   x19    s3    np[j],modulus
+# $m1   x20    s4    montgomery reduction coefficient
+# $nlo  x21    s5    the low word of reduction term
+# $nhi  x22    s6    the high word of reduction term
+# $ovf  x23    s7    highest carry bit,overflow flag
+# $i    x24    s8    outer loop index
+# $j    x25    s9    inner loop index
+# $tp   x26    s10   temporary result storage
+# $tj   x27    s11   tp[j],temporary result value
+# $temp x9     s1    scratch register for intermediate values
  my ($lo0,$hi0,$aj,$m0,$alo,$ahi,$lo1,$hi1,$nj,$m1,$nlo,$nhi,$ovf,$i,$j,$tp,$tj,$temp) = use_regs(5..7,28..31,18..27,9);
  
  # Carry variable
@@ -129,7 +129,7 @@ my $code .= <<___;
  .text
  .balign 32
  .globl bn_mul_mont
-.type   bn_mul_mont,\@function
+.type bn_mul_mont,\@function
  bn_mul_mont:
  ___
  
@@ -139,26 +139,25 @@ $code .= <<___;
      mv $fp, sp
  ___
  
-$code .= <<___;        
+$code .= <<___;
      ld $m0, 0($bp)    # bp[0]
      addi $bp, $bp,8
      ld $hi0, 0($ap)    # ap[0]
+    slli $num, $num, 3
+    sub $tp, sp, $num
      ld $aj, 8($ap)    # ap[1]
      addi $ap, $ap, 16
      ld $n0, 0($n0)    # n0,precomputed modular inverse
+    andi $tp, $tp, -16    # address alignment
      ld $hi1, 0($np)    # np[0]
+    mv sp, $tp    # alloca
      ld $nj, 8($np)    # np[1]
      addi $np, $np, 16
  
-    slli $num, $num, 3
-    sub $tp, sp, $num
-    andi $tp, $tp, -16    # address alignment
-    mv sp, $tp    # alloca
-
-    addi $j, $num, -16    # $j=(num-2)*8
  
      mul $lo0, $hi0, $m0    # ap[0]*bp[0]
-    mulhu $hi0, $hi0, $m0
+    addi $j, $num, -16    # $j=(num-2)*8
+    mulhu $hi0, $hi0, $m0    
      mul $alo, $aj, $m0    # ap[1]*bp[0]
      mulhu $ahi, $aj, $m0
  
@@ -166,12 +165,12 @@ $code .= <<___;
      # montgomery optimization: np[0]*m1 ensures (np[0]*m1+lo0) has zero lower bits
      # only carry status needed, not full lo1 result
      # eliminates mul/adds instructions → Saves cycles & power
-    # mul $lo1, $hi1, $m1              // np[0]*m1
+    # mul $lo1, $hi1, $m1   // np[0]*m1
      # adds $lo1, $lo1, $lo0   // discarded
      mulhu $hi1, $hi1, $m1
      snez $carry1, $lo0
-    add $hi1, $hi1, $carry1
      mul $nlo, $nj, $m1    # np[1]*m1
+    add $hi1, $hi1, $carry1
      mulhu $nhi, $nj, $m1
      beqz $j, .L1st_last_entry
  
@@ -191,46 +190,43 @@ $code .= <<___;
      # compute the sum of reduction term
      add $lo1, $nlo, $hi1    # {np[j-1]*m1,low}+{np[j-2]*m1,high}, j ranges from 2 to num-1
      sltu $carry1, $lo1, $nlo
+    mul $alo, $aj, $m0    # ap[j]*bp[0], j ranges from 2 to num-1
      add $hi1, $nhi, $carry1    # {np[j-1]*m1,high}+C_lo1, j ranges from 2 to num-1
-
+    mulhu $ahi, $aj, $m0
      # partial product + reduction term
-    add $temp, $lo1, $lo0
-    sltu $carry1, $temp, $lo1
-    mv $lo1, $temp
+    add $lo1, $lo1, $lo0
+    sltu $carry1, $lo1, $lo0
+    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
      add $hi1, $hi1, $carry1
-
+    mulhu $nhi, $nj, $m1
      sd $lo1, 0($tp)    # tp[j-2], j ranges from 2 to num-1
      addi $tp, $tp, 8
  
-    mul $alo, $aj, $m0    # ap[j]*bp[0], j ranges from 2 to num-1
-    mulhu $ahi, $aj, $m0
-    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
-    mulhu $nhi, $nj, $m1
      bnez $j, .L1st
  
  .L1st_last_entry:
      # last partial product
      add $lo0, $alo, $hi0    # {ap[j]*bp[0],low}+{ap[j-1]*bp[0],high}, j is num-1
      sltu $carry1, $lo0, $alo
-    add $hi0, $ahi, $carry1    # {ap[j]*bp[0],high}+C_lo0, j is num-1
-
      sub $ap, $ap, $num    # rewind $ap
-    sub $np, $np, $num    # rewind $np
+    add $hi0, $ahi, $carry1    # {ap[j]*bp[0],high}+C_lo0, j is num-1
  
      # last reduction term
      add $lo1, $nlo, $hi1    # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
      sltu $carry1, $lo1, $nlo
+    sub $np, $np, $num    # rewind $np
      add $hi1, $nhi, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1
  
      # last partial product + last reduction term
      add $lo1, $lo1, $lo0
      sltu $carry1, $lo1, $lo0
  
-    add $temp, $hi1, $hi0
-    sltu $carry2, $temp, $hi1
-    add $hi1, $temp, $carry1
-    sltu $ovf, $hi1, $temp
+    add $hi1, $hi1, $hi0
+    sltu $carry2, $hi1, $hi0
+    add $hi1, $hi1, $carry1
+    sltu $ovf, $hi1, $carry1
      or $carry1, $carry2, $ovf    # carry2 and ovf are mutually exclusive, both cannot be 1 simultaneously
+
      mv $ovf, $carry1    # upmost overflow bit
  
      addi $i, $num, -8    # $i=(num-1)*8
@@ -248,18 +244,16 @@ $code .= <<___;
      addi $tp, sp, 8    # tp[1]
  
      mul $lo0, $hi0, $m0    # ap[0]*bp[i], i ranges from 1 to num-1
+    addi $j, $num,-16    # $j=(num-2)*8 
      mulhu $hi0, $hi0, $m0
-
-    addi $j, $num,-16    # $j=(num-2)*8
      ld $hi1, 0($np)
      ld $nj, 8($np)
      addi $np, $np, 16
  
      mul $alo, $aj, $m0    # ap[1]*bp[i], i ranges from 1 to num-1
-    mulhu $ahi, $aj, $m0
-
      add $lo0, $lo0, $tj    # ap[0]*bp[i] + last_tp[0] , i ranges from 1 to num-1
      sltu $carry1, $lo0, $tj
+    mulhu $ahi, $aj, $m0
      add $hi0, $hi0, $carry1    # $hi0 will not overflow
  
      # compute the modular reduction coefficient
@@ -267,8 +261,6 @@ $code .= <<___;
  
      addi $i, $i, -8    # $i--, $i ranges from (num-1)*8 to 0
  
-    # mul $lo1, $hi1, $m1       # discarded
-    # adds     $lo1, $lo1, $lo0   # discarded
      mulhu $hi1, $hi1, $m1
      snez $carry1, $lo0
      mul $nlo, $nj, $m1    # np[1]*m1
@@ -299,20 +291,18 @@ $code .= <<___;
      ld $nj, 0($np)
      addi $np, $np, 8
  
+    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1 
      # partial product + reduction term
      add $lo0, $lo0, $tj
      sltu $carry1, $lo0, $tj
+    mulhu $ahi, $aj, $m0
      add $hi0, $hi0, $carry1
  
+    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
      add $lo1, $lo1, $lo0
      sltu $carry1, $lo1, $lo0
-
-    sd $lo1, -16($tp)    # tp[j-2], j ranges from 2 to num-1
-
-    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
-    mulhu $ahi, $aj, $m0
-    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
      mulhu $nhi, $nj, $m1
+    sd $lo1, -16($tp)    # tp[j-2], j ranges from 2 to num-1
  
      bnez $j, .Linner
  
@@ -324,19 +314,21 @@ $code .= <<___;
      # last partial product
      add $lo0, $alo, $hi0    # {ap[j]*bp[i],low}+{ap[j-1]*bp[i],high}, j is num-1, i ranges from 1 to num-1
      sltu $carry1, $lo0, $alo
-    add $hi0, $ahi, $carry1    # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1
-
      sub $ap, $ap, $num    # rewind $ap
-    sub        $np, $np, $num    # rewind $np
+    add $hi0, $ahi, $carry1    # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1
  
      # last reduction term
      add $lo1, $nlo, $hi1    # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
      sltu $carry1, $lo1, $nlo
-    add $temp, $nhi, $ovf
-    sltu $carry2, $temp, $nhi
-    add $hi1, $temp, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1
-    sltu $ovf, $hi1, $temp
+
+    sub $np, $np, $num    # rewind $np
+
+    add $hi1, $nhi, $ovf
+    sltu $carry2, $hi1, $ovf
+    add $hi1, $hi1, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1
+    sltu $ovf, $hi1, $carry1
      or $carry1, $carry2, $ovf
+
      mv $ovf, $carry1    # update the upmost overflow bit
  
      # last partial product + last reduction term
@@ -346,10 +338,11 @@ $code .= <<___;
  
      add $lo1, $lo1, $lo0
      sltu $carry1, $lo1, $lo0
-    add $temp, $hi1, $hi0
-    sltu $carry2, $temp, $hi1
-    add $hi1, $temp, $carry1
-    sltu $carry1, $hi1, $temp
+
+    add $hi1, $hi1, $hi0
+    sltu $carry2, $hi1, $hi0
+    add $hi1, $hi1, $carry1
+    sltu $carry1, $hi1, $carry1
      or $carry1, $carry2, $carry1
  
      add $ovf, $ovf, $carry1    # upmost overflow bit
@@ -363,18 +356,16 @@ $code .= <<___;
      ld $nj, 0($np)    # np[0]
      addi $np, $np, 8
      addi $j, $num, -8    # $j=(num-1)*8 and clear borrow
-    sltu $carry1, $num, 8
-    xori $carry1, $carry1, 1
+
+    li $carry1,0   # borrow flag starts at 0 (convention here: 0 = no borrow, 1 = borrow)
      mv $ap, $rp
  .Lsub:
      # tp[j]-np[j], j ranges from 0 to num-2, set carry flag
-    xori $carry1, $carry1,1
      sub $temp, $tj, $nj
      sltu $carry2, $tj, $temp
      sub $aj, $temp, $carry1
      sltu $carry1, $temp, $aj
      or $carry1, $carry2, $carry1
-    xori $carry1, $carry1, 1
  
      ld $tj, 0($tp)    # tp[j], j ranges from 1 to num-1
      addi $tp, $tp, 8
@@ -387,21 +378,15 @@ $code .= <<___;
      bnez $j, .Lsub
  
      # process the last word, tp[j]-np[j], j is num-1
-    xori $carry1, $carry1,1
      sub $temp, $tj, $nj
      sltu $carry2, $tj, $temp
      sub $aj, $temp, $carry1
      sltu $carry1, $temp, $aj
      or $carry1, $carry2, $carry1
-    xori $carry1, $carry1, 1
-
+    
      # whether there is a borrow
-    xori $carry1, $carry1, 1
-    sub $temp, $ovf, zero
-    sltu $carry2, $ovf, $temp
-    sub $ovf, $temp, $carry1
-    sltu $carry1, $temp, $ovf
-    or $carry1, $carry2, $carry1
+    sub $temp, $ovf, $carry1
+    sltu $carry1, $ovf, $temp
      xori $carry1, $carry1, 1
  
      sd $aj, 0($ap)    # rp[j], j is num-1
@@ -414,7 +399,6 @@ $code .= <<___;
      addi $rp, $rp, 8
      addi $num, $num, -8    # num--
      nop
-
  .Lcond_copy:
      addi $num,$num, -8    # num--
      # conditionally selects value based on borrow flag:
@@ -453,7 +437,7 @@ $code .= load_regs();
  
  $code .= <<___;
      ret
-.size  bn_mul_mont,.-bn_mul_mont
+.size bn_mul_mont,.-bn_mul_mont
  ___
  
  print $code;
author	fengpengbo <feng.pengbo@zte.com.cn>
	Fri, 19 Dec 2025 03:00:59 +0000 (11:00 +0800)
committer	Neil Horman <nhorman@openssl.org>
	Tue, 30 Dec 2025 13:22:46 +0000 (08:22 -0500)