From: WANG Xuerui Date: Tue, 5 Aug 2025 11:45:14 +0000 (+0800) Subject: LoongArch: Accelerate SHA-512 message scheduling with LSX X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fc403c7bfe6a9d92744e1cbf537632cccb7c9f2e;p=thirdparty%2Fopenssl.git LoongArch: Accelerate SHA-512 message scheduling with LSX Detect and use LSX (128-bit SIMD) to accelerate the message scheduling stage of SHA-512. The main compression round is not amenable to SIMD optimizations because horizontal data dependencies are very heavy. In the current approach, 8 vector registers are used to store the 16 active message schedule array elements, with each 128-bit register fully utilized, holding two 64-bit elements. Thankfully this means the data dependency between x[n] and x[n-2] for rounds >= 16 (0-based) is completely side-stepped, and with no computational power wasted. Performance numbers on Loongson 3C6000 (LA664 uarch) @ 2.2GHz: Before: ``` version: 3.6.0-dev built on: Sun Aug 3 10:22:36 2025 UTC options: bn(64,64) compiler: gcc -fPIC -pthread -Wa,--noexecstack -Wall -O3 -DOPENSSL_USE_NODELETE -DL_ENDIAN -DOPENSSL_PIC -DOPENSSL_BUILDING_OPENSSL -DNDEBUG CPUINFO: N/A The 'numbers' are in 1000s of bytes per second processed. type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes sha512 27701.14k 109625.41k 206773.00k 311351.64k 367442.71k 371401.72k ``` After: ``` version: 3.6.0-dev built on: Sun Aug 3 10:22:36 2025 UTC options: bn(64,64) compiler: gcc -fPIC -pthread -Wa,--noexecstack -Wall -O3 -DOPENSSL_USE_NODELETE -DL_ENDIAN -DOPENSSL_PIC -DOPENSSL_BUILDING_OPENSSL -DNDEBUG CPUINFO: N/A The 'numbers' are in 1000s of bytes per second processed. type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes sha512 33440.93k 132238.88k 272890.18k 429715.11k 518770.04k 526172.98k ``` Which is an improvement of between 20.63% (for 64B inputs; 20.72% for 16B which is very close) and 41.67% (for 16KiB inputs). This is a much larger improvement than for SHA-256. 
A similar attempt at utilizing LASX is also done, with 4 256-bit XRs being used to store 4 64-bit elements each, and with the sigma1 computation unrolled twice; performance actually dropped by -12.56% (64B; -12.59% for 16B) to -22.31% (16KiB) (!!). This large drop is most likely due to the fact that LASX is incapable of efficiently transferring data across the higher and lower 128-bit halves -- 3 cycles latency for every instruction capable of doing so. Specifically, with the unrolled sigma1 computation, one needs to swap halves and zero the other unused half, and do this twice; and re-aligning "1234" and "9abc" parts also requires two such moves, such as moving x[4] from m4567[0] to m1234[3]. So, ultimately I decided to remove LASX from the patchset. Signed-off-by: WANG Xuerui Reviewed-by: Tomas Mraz Reviewed-by: Paul Dale Reviewed-by: Neil Horman MergeDate: Wed Mar 11 13:56:17 2026 (Merged from https://github.com/openssl/openssl/pull/28192) --- diff --git a/crypto/sha/asm/sha512-loongarch64.pl b/crypto/sha/asm/sha512-loongarch64.pl index ac9fb03fbde..44d1736586a 100644 --- a/crypto/sha/asm/sha512-loongarch64.pl +++ b/crypto/sha/asm/sha512-loongarch64.pl @@ -42,6 +42,9 @@ use warnings; my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; +my $use_lsx = $flavour && $flavour =~ /lsx/i ? 1 : 0; +my $isaext = "_" . ( $use_lsx ? 
"lsx" : "la64v100" ); + $output and open STDOUT,">$output"; my $code=<<___; @@ -55,10 +58,13 @@ my ($zero,$ra,$tp,$sp,$fp)=("\$zero", "\$ra", "\$tp", "\$sp", "\$fp"); my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$a$_",(0..7)); my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8)=map("\$t$_",(0..8)); my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$s$_",(0..8)); +my ($va0, $va1, $va2, $va3, $va4, $va5, $va6, $va7) = map("\$vr$_",(0..7)); +my ($vt0, $vt1, $vt2, $vt3, $vt4, $vt5, $vt6, $vt7) = map("\$vr$_",(8..15)); my ($INP, $LEN, $ADDR) = ($a1, $a2, $sp); my ($KT, $T1, $T2, $T3, $T4, $T5, $T6) = ($t0, $t1, $t2, $t3, $t4, $t5, $t6); my ($A, $B, $C, $D, $E, $F, $G, $H) = ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7); +my @VMSGS = ($va0, $va1, $va2, $va3, $va4, $va5, $va6, $va7); sub strip { my ($str) = @_; @@ -66,8 +72,33 @@ sub strip { return $str; } +sub MSGSCHEDULE0_lsx { + my ($index) = @_; + my $msg = $VMSGS[$index / 2]; + my $code; + + if ($index % 2 == 0) { + $code = <<___; + vld $msg, $INP, @{[8*$index]} + vshuf4i.b $msg, $msg, 0b00011011 + vshuf4i.w $msg, $msg, 0b10110001 +___ + } + + $code .= <<___; + vpickve2gr.d $T1, $msg, @{[$index%2]} +___ + + return strip($code); +} + sub MSGSCHEDULE0 { my ($index) = @_; + + if ($use_lsx) { + return MSGSCHEDULE0_lsx($index); + } + my $code=<<___; ld.d $T1, $INP, @{[8*$index]} revb.d $T1, $T1 @@ -76,8 +107,68 @@ ___ return strip($code); } +sub MSGSCHEDULE1_lsx { + my ($index) = @_; + my $msgidx = ($index / 2) % 8; + my $m01 = $VMSGS[$msgidx]; + my $m23 = $VMSGS[($msgidx + 1) % 8]; + my $m45 = $VMSGS[($msgidx + 2) % 8]; + my $m67 = $VMSGS[($msgidx + 3) % 8]; + my $m89 = $VMSGS[($msgidx + 4) % 8]; + my $mab = $VMSGS[($msgidx + 5) % 8]; + my $mcd = $VMSGS[($msgidx + 6) % 8]; + my $mef = $VMSGS[($msgidx + 7) % 8]; + my ($m12, $tmp0, $tmp1) = ($vt0, $vt1, $vt2); + my $code; + + if ($index % 2 == 0) { + # re-align to get $m12 and "$m9a" ($tmp0) + $code = <<___; + # m01 & new = $m01, m23 = $m23, m45 = $m45, m67 = $m67 + # m89 = $m89, mab = $mab, 
mcd = $mcd, mef = $mef + vori.b $m12, $m01, 0 + vshuf4i.d $m12, $m23, 0b1001 + vori.b $tmp0, $m89, 0 + vshuf4i.d $tmp0, $mab, 0b1001 + vadd.d $m01, $m01, $tmp0 +___ + + # $m01 += sigma0($m12) + $code .= <<___; + vrotri.d $tmp0, $m12, 1 + vrotri.d $tmp1, $m12, 8 + vsrli.d $m12, $m12, 7 + vxor.v $tmp0, $tmp0, $tmp1 + vxor.v $m12, $m12, $tmp0 + vadd.d $m01, $m01, $m12 +___ + + # $m01 += sigma1 + # now m12 can be re-used as temporary + $code .= <<___; + vrotri.d $tmp0, $mef, 19 + vrotri.d $tmp1, $mef, 61 + vsrli.d $m12, $mef, 6 + vxor.v $tmp0, $tmp0, $tmp1 + vxor.v $m12, $m12, $tmp0 + vadd.d $m01, $m01, $m12 +___ + } + + $code .= <<___; + vpickve2gr.d $T1, $m01, @{[$index%2]} +___ + + return strip($code); +} + sub MSGSCHEDULE1 { my ($index) = @_; + + if ($use_lsx) { + return MSGSCHEDULE1_lsx($index); + } + my $code=<<___; ld.d $T1, $ADDR, @{[(($index-2)&0x0f)*8]} ld.d $T2, $ADDR, @{[(($index-15)&0x0f)*8]} @@ -152,12 +243,12 @@ ___ } ################################################################################ -# void sha512_block_data_order(void *c, const void *p, size_t len) +# void sha512_block_data_order$isaext(void *c, const void *p, size_t len) $code .= <<___; .p2align 3 -.globl sha512_block_data_order -.type sha512_block_data_order,\@function -sha512_block_data_order: +.globl sha512_block_data_order@{[$isaext]} +.type sha512_block_data_order@{[$isaext]},\@function +sha512_block_data_order@{[$isaext]}: addi.d $sp, $sp, -80 @@ -171,9 +262,17 @@ sha512_block_data_order: st.d $s7, $sp, 56 st.d $s8, $sp, 64 st.d $fp, $sp, 72 +___ +# SHA512 LSX needs neither dedicated shuffle control word, nor stack space for +# internal states +if (!$use_lsx) { + $code .= <<___; addi.d $sp, $sp, -128 +___ +} +$code .= <<___; la $KT, $K512 # load ctx @@ -238,9 +337,15 @@ $code .= <<___; addi.d $INP, $INP, 128 bnez $LEN, L_round_loop +___ +if (!$use_lsx) { + $code .= <<___; addi.d $sp, $sp, 128 +___ +} +$code .= <<___; ld.d $s0, $sp, 0 ld.d $s1, $sp, 8 ld.d $s2, $sp, 16 @@ -255,7 
+360,7 @@ $code .= <<___; addi.d $sp, $sp, 80 ret -.size sha512_block_data_order,.-sha512_block_data_order +.size sha512_block_data_order@{[$isaext]},.-sha512_block_data_order@{[$isaext]} .section .rodata .p2align 3 diff --git a/crypto/sha/build.info b/crypto/sha/build.info index f7e7c6d1751..457ac8d06ab 100644 --- a/crypto/sha/build.info +++ b/crypto/sha/build.info @@ -18,7 +18,9 @@ IF[{- !$disabled{asm} -}] $SHA1ASM_alpha=sha1-alpha.S $SHA1DEF_alpha=SHA1_ASM - $SHA1ASM_loongarch64=sha_loongarch.c sha256-loongarch64.S sha256-loongarch64-lsx.S sha512-loongarch64.S + $SHA1ASM_loongarch64=sha_loongarch.c \ + sha256-loongarch64.S sha256-loongarch64-lsx.S \ + sha512-loongarch64.S sha512-loongarch64-lsx.S $SHA1DEF_loongarch64=SHA256_ASM SHA512_ASM $SHA1ASM_mips32=sha1-mips.S sha256-mips.S @@ -142,6 +144,8 @@ GENERATE[sha256-loongarch64-lsx.S]=asm/sha256-loongarch64.pl lsx INCLUDE[sha256-loongarch64-lsx.o]=.. GENERATE[sha512-loongarch64.S]=asm/sha512-loongarch64.pl INCLUDE[sha512-loongarch64.o]=.. +GENERATE[sha512-loongarch64-lsx.S]=asm/sha512-loongarch64.pl lsx +INCLUDE[sha512-loongarch64-lsx.o]=.. GENERATE[sha1-mips.S]=asm/sha1-mips.pl INCLUDE[sha1-mips.o]=.. diff --git a/crypto/sha/sha_loongarch.c b/crypto/sha/sha_loongarch.c index bade69f19f3..da8dac8975f 100644 --- a/crypto/sha/sha_loongarch.c +++ b/crypto/sha/sha_loongarch.c @@ -26,3 +26,16 @@ void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) sha256_block_data_order_la64v100(ctx, in, num); } } + +void sha512_block_data_order_la64v100(void *ctx, const void *in, size_t num); +void sha512_block_data_order_lsx(void *ctx, const void *in, size_t num); +void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num); + +void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num) +{ + if (OPENSSL_loongarch_hwcap_P & LOONGARCH_HWCAP_LSX) { + sha512_block_data_order_lsx(ctx, in, num); + } else { + sha512_block_data_order_la64v100(ctx, in, num); + } +}