From: fengpengbo Date: Fri, 19 Dec 2025 03:00:59 +0000 (+0800) Subject: This PR performs further optimization based on the already merged "Implement Montgome... X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b34e890c2104550c9a02b29eeb81da44240a2aac;p=thirdparty%2Fopenssl.git This PR performs further optimization based on the already merged "Implement Montgomery multiplication assembly optimization for RV64GC" (#28012). The key improvements include: code formatting unification, instruction scheduling optimization, register allocation strategy update, and updates to addition/subtraction carry and borrow flag handling. These enhancements aim to improve both the maintainability of the code and its execution performance, particularly on out-of-order RISC-V cores. Reviewed-by: Neil Horman Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/29438) --- diff --git a/crypto/bn/asm/riscv64-mont.pl b/crypto/bn/asm/riscv64-mont.pl index b512dcfd015..af55d0c1c31 100644 --- a/crypto/bn/asm/riscv64-mont.pl +++ b/crypto/bn/asm/riscv64-mont.pl @@ -57,7 +57,7 @@ $output and open STDOUT,">$output"; $stack_offset -= 8; $ret.=" sd $_,$stack_offset(sp)\n"; } - return $ret; + return $ret; } sub load_regs { my $ret = ''; @@ -70,7 +70,7 @@ $output and open STDOUT,">$output"; $stack_offset -= 8; $ret.=" ld $_,$stack_offset(sp)\n"; } - $ret.=" addi sp,sp,$stack_reservation\n"; + $ret.=" addi sp,sp,$stack_reservation\n"; return $ret; } sub clear_regs { @@ -84,12 +84,12 @@ $output and open STDOUT,">$output"; # Function arguments # RISC-V ABI -# $rp x10 a0 # BN_ULONG *rp -# $ap x11 a1 # const BN_ULONG *ap -# $bp x12 a2 # const BN_ULONG *bp -# $np x13 a3 # const BN_ULONG *np -# $n0 x14 a4 # const BN_ULONG *n0 -# $num x15 a5 # int num +# $rp x10 a0 # BN_ULONG *rp +# $ap x11 a1 # const BN_ULONG *ap +# $bp x12 a2 # const BN_ULONG *bp +# $np x13 a3 # const BN_ULONG *np +# $n0 x14 a4 # const BN_ULONG *n0 +# $num x15 a5 # int num my ($rp,$ap,$bp,$np,$n0,$num) = 
use_regs(10,11,12,13,14,15); # Return address and Frame pointer @@ -100,24 +100,24 @@ my ($ra,$fp) = use_regs(1,8); # Temporary variable allocation # RISC-V ABI -# $lo0 x5 t0 the sum of partial products of a and b -# $hi0 x6 t1 the high word of partial product of a and b + Carry -# $aj x7 t2 ap[j] -# $m0 x28 t3 bp[i] -# $alo x29 t4 the low word of partial product -# $ahi x30 t5 the high word of partial product -# $lo1 x31 t6 partial product + reduction term -# $hi1 x18 s2 the high word of reduction term + Carry -# $nj x19 s3 np[j],modulus -# $m1 x20 s4 montgomery reduction coefficient -# $nlo x21 s5 the low word of reduction term -# $nhi x22 s6 the high word of reduction term -# $ovf x23 s7 highest carry bit,overflow flag -# $i x24 s8 outer loop index -# $j x25 s9 inner loop index -# $tp x26 s10 temporary result storage -# $tj x27 s11 tp[j],temporary result value -# $temp x9 s1 +# $lo0 x5 t0 the sum of partial products of a and b +# $hi0 x6 t1 the high word of partial product of a and b + Carry +# $aj x7 t2 ap[j] +# $m0 x28 t3 bp[i] +# $alo x29 t4 the low word of partial product +# $ahi x30 t5 the high word of partial product +# $lo1 x31 t6 partial product + reduction term +# $hi1 x18 s2 the high word of reduction term + Carry +# $nj x19 s3 np[j],modulus +# $m1 x20 s4 montgomery reduction coefficient +# $nlo x21 s5 the low word of reduction term +# $nhi x22 s6 the high word of reduction term +# $ovf x23 s7 highest carry bit,overflow flag +# $i x24 s8 outer loop index +# $j x25 s9 inner loop index +# $tp x26 s10 temporary result storage +# $tj x27 s11 tp[j],temporary result value +# $temp x9 s1 my ($lo0,$hi0,$aj,$m0,$alo,$ahi,$lo1,$hi1,$nj,$m1,$nlo,$nhi,$ovf,$i,$j,$tp,$tj,$temp) = use_regs(5..7,28..31,18..27,9); # Carry variable @@ -129,7 +129,7 @@ my $code .= <<___; .text .balign 32 .globl bn_mul_mont -.type bn_mul_mont,\@function +.type bn_mul_mont,\@function bn_mul_mont: ___ @@ -139,26 +139,25 @@ $code .= <<___; mv $fp, sp ___ -$code .= <<___; +$code .= <<___; ld 
$m0, 0($bp) # bp[0] addi $bp, $bp,8 ld $hi0, 0($ap) # ap[0] + slli $num, $num, 3 + sub $tp, sp, $num ld $aj, 8($ap) # ap[1] addi $ap, $ap, 16 ld $n0, 0($n0) # n0,precomputed modular inverse + andi $tp, $tp, -16 # address alignment ld $hi1, 0($np) # np[0] + mv sp, $tp # alloca ld $nj, 8($np) # np[1] addi $np, $np, 16 - slli $num, $num, 3 - sub $tp, sp, $num - andi $tp, $tp, -16 # address alignment - mv sp, $tp # alloca - - addi $j, $num, -16 # $j=(num-2)*8 mul $lo0, $hi0, $m0 # ap[0]*bp[0] - mulhu $hi0, $hi0, $m0 + addi $j, $num, -16 # $j=(num-2)*8 + mulhu $hi0, $hi0, $m0 mul $alo, $aj, $m0 # ap[1]*bp[0] mulhu $ahi, $aj, $m0 @@ -166,12 +165,12 @@ $code .= <<___; # montgomery optimization: np[0]*m1 ensures (np[0]*m1+lo0) has zero lower bits # only carry status needed, not full lo1 result # eliminates mul/adds instructions → Saves cycles & power - # mul $lo1, $hi1, $m1 // np[0]*m1 + # mul $lo1, $hi1, $m1 // np[0]*m1 # adds $lo1, $lo1, $lo0 // discarded mulhu $hi1, $hi1, $m1 snez $carry1, $lo0 - add $hi1, $hi1, $carry1 mul $nlo, $nj, $m1 # np[1]*m1 + add $hi1, $hi1, $carry1 mulhu $nhi, $nj, $m1 beqz $j, .L1st_last_entry @@ -191,46 +190,43 @@ $code .= <<___; # compute the sum of reduction term add $lo1, $nlo, $hi1 # {np[j-1]*m1,low}+{np[j-2]*m1,high}, j ranges from 2 to num-1 sltu $carry1, $lo1, $nlo + mul $alo, $aj, $m0 # ap[j]*bp[0], j ranges from 2 to num-1 add $hi1, $nhi, $carry1 # {np[j-1]*m1,high}+C_lo1, j ranges from 2 to num-1 - + mulhu $ahi, $aj, $m0 # partial product + reduction term - add $temp, $lo1, $lo0 - sltu $carry1, $temp, $lo1 - mv $lo1, $temp + add $lo1, $lo1, $lo0 + sltu $carry1, $lo1, $lo0 + mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1 add $hi1, $hi1, $carry1 - + mulhu $nhi, $nj, $m1 sd $lo1, 0($tp) # tp[j-2], j ranges from 2 to num-1 addi $tp, $tp, 8 - mul $alo, $aj, $m0 # ap[j]*bp[0], j ranges from 2 to num-1 - mulhu $ahi, $aj, $m0 - mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1 - mulhu $nhi, $nj, $m1 bnez $j, .L1st 
.L1st_last_entry: # last partial product add $lo0, $alo, $hi0 # {ap[j]*bp[0],low}+{ap[j-1]*bp[0],high}, j is num-1 sltu $carry1, $lo0, $alo - add $hi0, $ahi, $carry1 # {ap[j]*bp[0],high}+C_lo0, j is num-1 - sub $ap, $ap, $num # rewind $ap - sub $np, $np, $num # rewind $np + add $hi0, $ahi, $carry1 # {ap[j]*bp[0],high}+C_lo0, j is num-1 # last reduction term add $lo1, $nlo, $hi1 # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1 sltu $carry1, $lo1, $nlo + sub $np, $np, $num # rewind $np add $hi1, $nhi, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1 # last partial product + last reduction term add $lo1, $lo1, $lo0 sltu $carry1, $lo1, $lo0 - add $temp, $hi1, $hi0 - sltu $carry2, $temp, $hi1 - add $hi1, $temp, $carry1 - sltu $ovf, $hi1, $temp + add $hi1, $hi1, $hi0 + sltu $carry2, $hi1, $hi0 + add $hi1, $hi1, $carry1 + sltu $ovf, $hi1, $carry1 or $carry1, $carry2, $ovf # carry2 and ovf are mutually exclusive, both cannot be 1 simultaneously + mv $ovf, $carry1 # upmost overflow bit addi $i, $num, -8 # $i=(num-1)*8 @@ -248,18 +244,16 @@ $code .= <<___; addi $tp, sp, 8 # tp[1] mul $lo0, $hi0, $m0 # ap[0]*bp[i], i ranges from 1 to num-1 + addi $j, $num,-16 # $j=(num-2)*8 mulhu $hi0, $hi0, $m0 - - addi $j, $num,-16 # $j=(num-2)*8 ld $hi1, 0($np) ld $nj, 8($np) addi $np, $np, 16 mul $alo, $aj, $m0 # ap[1]*bp[i], i ranges from 1 to num-1 - mulhu $ahi, $aj, $m0 - add $lo0, $lo0, $tj # ap[0]*bp[i] + last_tp[0] , i ranges from 1 to num-1 sltu $carry1, $lo0, $tj + mulhu $ahi, $aj, $m0 add $hi0, $hi0, $carry1 # $hi0 will not overflow # compute the modular reduction coefficient @@ -267,8 +261,6 @@ $code .= <<___; addi $i, $i, -8 # $i--, $i ranges from (num-1)*8 to 0 - # mul $lo1, $hi1, $m1 # discarded - # adds $lo1, $lo1, $lo0 # discarded mulhu $hi1, $hi1, $m1 snez $carry1, $lo0 mul $nlo, $nj, $m1 # np[1]*m1 @@ -299,20 +291,18 @@ $code .= <<___; ld $nj, 0($np) addi $np, $np, 8 + mul $alo, $aj, $m0 # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1 # partial product + 
reduction term add $lo0, $lo0, $tj sltu $carry1, $lo0, $tj + mulhu $ahi, $aj, $m0 add $hi0, $hi0, $carry1 + mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1 add $lo1, $lo1, $lo0 sltu $carry1, $lo1, $lo0 - - sd $lo1, -16($tp) # tp[j-2], j ranges from 2 to num-1 - - mul $alo, $aj, $m0 # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1 - mulhu $ahi, $aj, $m0 - mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1 mulhu $nhi, $nj, $m1 + sd $lo1, -16($tp) # tp[j-2], j ranges from 2 to num-1 bnez $j, .Linner @@ -324,19 +314,21 @@ $code .= <<___; # last partial product add $lo0, $alo, $hi0 # {ap[j]*bp[i],low}+{ap[j-1]*bp[i],high}, j is num-1, i ranges from 1 to num-1 sltu $carry1, $lo0, $alo - add $hi0, $ahi, $carry1 # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1 - sub $ap, $ap, $num # rewind $ap - sub $np, $np, $num # rewind $np + add $hi0, $ahi, $carry1 # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1 # last reduction term add $lo1, $nlo, $hi1 # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1 sltu $carry1, $lo1, $nlo - add $temp, $nhi, $ovf - sltu $carry2, $temp, $nhi - add $hi1, $temp, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1 - sltu $ovf, $hi1, $temp + + sub $np, $np, $num # rewind $np + + add $hi1, $nhi, $ovf + sltu $carry2, $hi1, $ovf + add $hi1, $hi1, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1 + sltu $ovf, $hi1, $carry1 or $carry1, $carry2, $ovf + mv $ovf, $carry1 # update the upmost overflow bit # last partial product + last reduction term @@ -346,10 +338,11 @@ $code .= <<___; add $lo1, $lo1, $lo0 sltu $carry1, $lo1, $lo0 - add $temp, $hi1, $hi0 - sltu $carry2, $temp, $hi1 - add $hi1, $temp, $carry1 - sltu $carry1, $hi1, $temp + + add $hi1, $hi1, $hi0 + sltu $carry2, $hi1, $hi0 + add $hi1, $hi1, $carry1 + sltu $carry1, $hi1, $carry1 or $carry1, $carry2, $carry1 add $ovf, $ovf, $carry1 # upmost overflow bit @@ -363,18 +356,16 @@ $code .= <<___; ld $nj, 0($np) # np[0] addi $np, $np, 8 addi $j, $num, -8 # 
$j=(num-1)*8 and clear borrow - sltu $carry1, $num, 8 - xori $carry1, $carry1, 1 + + li $carry1,0 # Custom, no borrow, C=0 (normal case, with borrow C=1) mv $ap, $rp .Lsub: # tp[j]-np[j], j ranges from 0 to num-2, set carry flag - xori $carry1, $carry1,1 sub $temp, $tj, $nj sltu $carry2, $tj, $temp sub $aj, $temp, $carry1 sltu $carry1, $temp, $aj or $carry1, $carry2, $carry1 - xori $carry1, $carry1, 1 ld $tj, 0($tp) # tp[j], j ranges from 1 to num-1 addi $tp, $tp, 8 @@ -387,21 +378,15 @@ $code .= <<___; bnez $j, .Lsub # process the last word, tp[j]-np[j], j is num-1 - xori $carry1, $carry1,1 sub $temp, $tj, $nj sltu $carry2, $tj, $temp sub $aj, $temp, $carry1 sltu $carry1, $temp, $aj or $carry1, $carry2, $carry1 - xori $carry1, $carry1, 1 - + # whether there is a borrow - xori $carry1, $carry1, 1 - sub $temp, $ovf, zero - sltu $carry2, $ovf, $temp - sub $ovf, $temp, $carry1 - sltu $carry1, $temp, $ovf - or $carry1, $carry2, $carry1 + sub $temp, $ovf, $carry1 + sltu $carry1, $ovf, $temp xori $carry1, $carry1, 1 sd $aj, 0($ap) # rp[j], j is num-1 @@ -414,7 +399,6 @@ $code .= <<___; addi $rp, $rp, 8 addi $num, $num, -8 # num-- nop - .Lcond_copy: addi $num,$num, -8 # num-- # conditionally selects value based on borrow flag: @@ -453,7 +437,7 @@ $code .= load_regs(); $code .= <<___; ret -.size bn_mul_mont,.-bn_mul_mont +.size bn_mul_mont,.-bn_mul_mont ___ print $code;