From: xxcui
Date: Sat, 29 Nov 2025 01:46:04 +0000 (+0800)
Subject: SHA512 performance optimized by RISCV RVV
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;p=thirdparty%2Fopenssl.git

SHA512 performance optimized by RISCV RVV

This patch improves SHA512 speed with the RISC-V Cryptographic Vector
Extension. The performance figures below were measured on a Xuantie C930
FPGA with VLEN=256.

- sha512 speed improved from 197032KB to 1010986KB

Reviewed-by: Paul Yang
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/29263)
---

diff --git a/.github/workflows/riscv-more-cross-compiles.yml b/.github/workflows/riscv-more-cross-compiles.yml
index 1747f56a38..b519ec2ff0 100644
--- a/.github/workflows/riscv-more-cross-compiles.yml
+++ b/.github/workflows/riscv-more-cross-compiles.yml
@@ -161,6 +161,26 @@ jobs:
           qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=128,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
           opensslcapsname: riscvcap, # OPENSSL_riscvcap
           opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh"
+        }, {
+          # RV64GC with all currently OpenSSL-supported extensions, with zvl256
+          # crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
+          arch: riscv64-linux-gnu,
+          libs: libc6-dev-riscv64-cross,
+          target: linux64-riscv64,
+          fips: no,
+          qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=256,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
+          opensslcapsname: riscvcap, # OPENSSL_riscvcap
+          opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvl256"
+        }, {
+          # RV64GC with all currently OpenSSL-supported extensions, with zvl512
+          # crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
+          arch: riscv64-linux-gnu,
+          libs: libc6-dev-riscv64-cross,
+          target: linux64-riscv64,
+          fips: no,
+          qemucpu: "rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true,zbkc=true,zbkx=true,zknd=true,zkne=true,zknh=true,zksed=true,zksh=true,zkr=true,zkt=true,v=true,vlen=512,zvbb=true,zvbc=true,zvkb=true,zvkg=true,zvkned=true,zvknha=true,zvknhb=true,zvksed=true,zvksh=true",
+          opensslcapsname: riscvcap, # OPENSSL_riscvcap
+          opensslcaps: "rv64gc_zba_zbb_zbc_zbs_zbkb_zbkc_zbkx_zknd_zkne_zknh_zksed_zksh_zkr_zkt_v_zvbb_zvbc_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvl512"
         }, {
           # Inline asm
           # zbb/zbkb:
diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm
index bac41fb453..54833c22af 100644
--- a/crypto/perlasm/riscv.pm
+++ b/crypto/perlasm/riscv.pm
@@ -624,6 +624,14 @@ sub vmv_v_i {
     return ".word ".($template | ($imm << 15) | ($vd << 7));
 }
 
+sub vmv1r_v {
+    # vmv1r.v vd, vs1
+    my $template = 0b1001111_00000_00000_011_00000_1010111;
+    my $vd = read_vreg shift;
+    my $vs1 = read_vreg shift;
+    return ".word ".($template | ($vs1 << 20) | ($vd << 7));
+}
+
 sub vmv_v_x {
     # vmv.v.x vd, rs1
     my $template = 0b0101111_00000_00000_100_00000_1010111;
diff --git a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
index c5df987296..29a51b2f2b 100644
--- a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
+++ b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl
@@ -70,6 +70,7 @@ my $K512 = "K512";
 
 # Function arguments
 my ($H, $INP, $LEN, $KT, $H2, $INDEX_PATTERN) = ("a0", "a1", "a2", "a3", "t3", "t4");
+my ($T0, $T1) = ("t0", "t1");
 
 ################################################################################
 # void sha512_block_data_order_zvkb_zvknhb(void *c, const void *p, size_t len)
@@ -78,8 +79,6 @@ $code .= <<___;
 .globl sha512_block_data_order_zvkb_zvknhb
 .type sha512_block_data_order_zvkb_zvknhb,\@function
 sha512_block_data_order_zvkb_zvknhb:
-    @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]}
-
     # H is stored as {a,b,c,d},{e,f,g,h}, but we need {f,e,b,a},{h,g,d,c}
     # The dst vtype is e64m2 and the index vtype is e8mf4.
     # We use index-load with the following index pattern at v1.
@@ -105,9 +104,226 @@ sha512_block_data_order_zvkb_zvknhb:
     @{[vsetivli "zero", 1, "e8", "m1", "ta", "ma"]}
     @{[vmv_v_i $V0, 0x01]}
 
-    @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]}
+    # Obtain VLEN and select the corresponding branch
+    csrr t0, vlenb
+    srl t1, t0, 5
+    beqz t1, sha512_block_data_order_zvkb_zvknhb_zvl128
+sha512_block_data_order_zvkb_zvknhb_zvl256_zvl512:
+    # When vlen=256 or 512, the round constants K512 can be loaded
+    # at once in vector register files.
+    @{[vsetivli "zero", 4, "e64", "m1", "ta", "ma"]}
+    # Load round constants K512
+    la $KT, $K512
+    @{[vle64_v $V2, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V3, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V4, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V5, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V6, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V7, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V8, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V9, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V11, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V13, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V15, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V17, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V19, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V21, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V23, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V25, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V27, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V29, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V30, ($KT)]}
+    addi $KT, $KT, 32
+    @{[vle64_v $V31, ($KT)]}
+
+L_round_loop_256_512:
+    # Decrement length by 1
+    addi $LEN, $LEN, -1
 
-L_round_loop:
+    # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}.
+    @{[vmv1r_v $V26, $V22]}
+    @{[vmv1r_v $V28, $V24]}
+
+    # Load the 1024-bits of the message block in v10, v12, v14, v16
+    # and perform the endian swap.
+    @{[vle64_v $V10, $INP]}
+    @{[vrev8_v $V10, $V10]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V12, $INP]}
+    @{[vrev8_v $V12, $V12]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V14, $INP]}
+    @{[vrev8_v $V14, $V14]}
+    addi $INP, $INP, 32
+    @{[vle64_v $V16, $INP]}
+    @{[vrev8_v $V16, $V16]}
+    addi $INP, $INP, 32
+
+    # Quad-round 0 (+0, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V2, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 1 (+1, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V3, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 2 (+2, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V4, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 3 (+3, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V5, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 4 (+4, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V6, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 5 (+5, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V7, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 6 (+6, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V8, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 7 (+7, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V9, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 8 (+8, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V11, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 9 (+9, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V13, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 10 (+10, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V15, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 11 (+11, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V17, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 12 (+12, v10->v12->v14->v16)
+    @{[vadd_vv $V18, $V19, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V14, $V12, $V0]}
+    @{[vsha2ms_vv $V10, $V18, $V16]}
+
+    # Quad-round 13 (+13, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V21, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V16, $V14, $V0]}
+    @{[vsha2ms_vv $V12, $V18, $V10]}
+
+    # Quad-round 14 (+14, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V23, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V10, $V16, $V0]}
+    @{[vsha2ms_vv $V14, $V18, $V12]}
+
+    # Quad-round 15 (+15, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V25, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+    @{[vmerge_vvm $V18, $V12, $V10, $V0]}
+    @{[vsha2ms_vv $V16, $V18, $V14]}
+
+    # Quad-round 16 (+0, v10->v12->v14->v16)
+    # Note that we stop generating new message schedule words (Wt, v10-16)
+    # as we already generated all the words we end up consuming (i.e., W[79:76]).
+    @{[vadd_vv $V18, $V27, $V10]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 17 (+1, v12->v14->v16->v10)
+    @{[vadd_vv $V18, $V29, $V12]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 18 (+2, v14->v16->v10->v12)
+    @{[vadd_vv $V18, $V30, $V14]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # Quad-round 19 (+3, v16->v10->v12->v14)
+    @{[vadd_vv $V18, $V31, $V16]}
+    @{[vsha2cl_vv $V24, $V22, $V18]}
+    @{[vsha2ch_vv $V22, $V24, $V18]}
+
+    # H' = H+{a',b',c',...,h'}
+    @{[vadd_vv $V22, $V26, $V22]}
+    @{[vadd_vv $V24, $V28, $V24]}
+    bnez $LEN, L_round_loop_256_512
+
+    # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}.
+    @{[vsuxei8_v $V22, ($H), $V1]}
+    @{[vsuxei8_v $V24, ($H2), $V1]}
+
+    ret
+sha512_block_data_order_zvkb_zvknhb_zvl128:
+    @{[vsetivli $T0, 4, "e64", "m2", "ta", "ma"]}
+L_round_loop_128:
     # Load round constants K512
     la $KT, $K512
@@ -204,7 +420,7 @@ L_round_loop:
     # H' = H+{a',b',c',...,h'}
     @{[vadd_vv $V22, $V26, $V22]}
    @{[vadd_vv $V24, $V28, $V24]}
-    bnez $LEN, L_round_loop
+    bnez $LEN, L_round_loop_128
 
     # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}.
     @{[vsuxei8_v $V22, ($H), $V1]}
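
Note on the VLEN dispatch added above: csrr t0, vlenb reads the vector
register width VLEN in bytes, so t0 >> 5 is non-zero exactly when
VLEN >= 256 bits (32 bytes). With VLEN=256 an m1 register holds four
64-bit elements, so the 80 SHA-512 round constants (640 bytes) fit in the
twenty vector registers preloaded before L_round_loop_256_512 and never
have to be re-read inside the block loop; with VLEN=512 the same twenty
loads are used and each register is simply half filled (vl stays at 4).
When VLEN is 128 the code branches to the original path, which reloads
K512 from memory on every iteration. A minimal C sketch of the same
decision, assuming a compiler targeting rv64gcv; sha512_blocks_zvl256 and
sha512_blocks_zvl128 are placeholder names, not OpenSSL symbols:

#include <stddef.h>
#include <stdint.h>

/* Read the vlenb CSR: VLEN expressed in bytes. */
static inline uint64_t rvv_vlenb(void)
{
    uint64_t vlenb;
    __asm__ volatile ("csrr %0, vlenb" : "=r" (vlenb));
    return vlenb;
}

/* Hypothetical dispatcher mirroring the branch at the top of
 * sha512_block_data_order_zvkb_zvknhb: vlenb >> 5 == 0 iff VLEN < 256. */
void sha512_blocks(void *state, const void *in, size_t num_blocks)
{
    (void)state; (void)in; (void)num_blocks;
    if (rvv_vlenb() >> 5) {
        /* VLEN >= 256: keep all K512 constants resident in v2..v31. */
        /* sha512_blocks_zvl256(state, in, num_blocks); */
    } else {
        /* VLEN == 128: reload K512 from memory on every block iteration. */
        /* sha512_blocks_zvl128(state, in, num_blocks); */
    }
}

Keeping K512 resident removes the per-block constant reloads from the
inner loop, which is what the zvl256/zvl512 branch is for.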
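The before/after figures in the commit message (197032KB to 1010986KB)
appear to be openssl speed style throughput numbers. The vector
implementation is gated on the zvkb/zvknhb capabilities (see the
OPENSSL_riscvcap strings in the workflow entries above), and the wide
branch is then chosen at run time from vlenb as shown earlier. As a rough
cross-check, SHA-512 throughput can also be measured through OpenSSL's
public EVP API; the sketch below is illustrative only, with an arbitrary
buffer size and iteration count not taken from the patch, and must be
linked with -lcrypto:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <openssl/evp.h>

/* Hash a 16 KB buffer repeatedly and report approximate KB/s. */
int main(void)
{
    static unsigned char buf[16 * 1024];
    unsigned char md[EVP_MAX_MD_SIZE];
    unsigned int mdlen;
    const int iters = 100000;
    struct timespec t0, t1;

    memset(buf, 0xa5, sizeof(buf));
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++) {
        EVP_MD_CTX *ctx = EVP_MD_CTX_new();
        EVP_DigestInit_ex(ctx, EVP_sha512(), NULL);
        EVP_DigestUpdate(ctx, buf, sizeof(buf));
        EVP_DigestFinal_ex(ctx, md, &mdlen);
        EVP_MD_CTX_free(ctx);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("SHA-512: %.0f KB/s\n", (double)iters * sizeof(buf) / 1024.0 / secs);
    return 0;
}

openssl speed sha512 remains the reference way to produce numbers that are
directly comparable to the ones quoted above.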