Improve speed of ZSTD_compressSequencesAndLiterals() using RVV 4399/head
author 李子建 (Zijian Li) <zijian.oerv@isrc.iscas.ac.cn>
Fri, 16 May 2025 06:57:32 +0000 (14:57 +0800)
committer Zijian Li <zijian.oerv@isrc.iscas.ac.cn>
Mon, 2 Jun 2025 09:21:02 +0000 (17:21 +0800)
lib/common/compiler.h
lib/compress/zstd_compress.c

index 944774a7a3b590f4ae9e57af42edbdf4cf521bce..cafb35b71e40c52372ac2d0f371f7974d809833a 100644 (file)
 #  if defined(__ARM_NEON) || defined(_M_ARM64)
 #    define ZSTD_ARCH_ARM_NEON
 #  endif
+#  if defined(__riscv) && defined(__riscv_vector)
+#    define ZSTD_ARCH_RISCV_RVV
+#  endif
 #
 #  if defined(ZSTD_ARCH_X86_AVX2)
 #    include <immintrin.h>
 #  elif defined(ZSTD_ARCH_ARM_NEON)
 #    include <arm_neon.h>
 #  endif
+#  if defined(ZSTD_ARCH_RISCV_RVV)
+#    include <riscv_vector.h>
+#  endif
 #endif
 
 /* C-language Attributes are added in C23. */
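
The new ZSTD_ARCH_RISCV_RVV macro mirrors the existing ZSTD_ARCH_ARM_NEON / ZSTD_ARCH_X86_AVX2 detection: it is defined only when the compiler targets RISC-V with the vector extension enabled (e.g. -march=rv64gcv), in which case <riscv_vector.h> is also pulled in. The following is a minimal sketch of how a translation unit might guard an RVV path on this macro; it is illustrative only, not part of this commit, and the helper addOne_u32 is hypothetical.

#include <stddef.h>
#include <stdint.h>
#include "compiler.h"   /* defines ZSTD_ARCH_RISCV_RVV when the V extension is available */

#if defined(ZSTD_ARCH_RISCV_RVV)
#include <riscv_vector.h>
/* hypothetical helper: add 1 to every element, processing vl elements per iteration */
static void addOne_u32(uint32_t* dst, const uint32_t* src, size_t n)
{
    size_t vl;
    for (size_t i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e32m1(n - i);                   /* elements handled this pass */
        vuint32m1_t v = __riscv_vle32_v_u32m1(src + i, vl); /* unit-stride load */
        __riscv_vse32_v_u32m1(dst + i, __riscv_vadd_vx_u32m1(v, 1, vl), vl);
    }
}
#else
static void addOne_u32(uint32_t* dst, const uint32_t* src, size_t n)
{
    for (size_t i = 0; i < n; i++) dst[i] = src[i] + 1;
}
#endif
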
index c8f6b28658066939138f47101ca6bc8236ac2213..9b7aaf9f4f4ad156f740a45a54917957c5117bb2 100644 (file)
@@ -7284,6 +7284,93 @@ static size_t convertSequences_noRepcodes(
     return longLen;
 }
 
+#elif defined ZSTD_ARCH_RISCV_RVV
+#include <riscv_vector.h>
+/*
+ * Convert `vl` sequences per iteration, using RVV intrinsics:
+ *   - offset -> offBase = offset + ZSTD_REP_NUM
+ *   - litLength -> (U16) litLength
+ *   - matchLength -> (U16)(matchLength - 3)
+ *   - rep is ignored
+ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]).
+ *
+ * @returns 0 on success, with no long length detected
+ * @returns > 0 if there is one long length (> 65535),
+ * indicating the position and type.
+ */
+static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
+    size_t longLen = 0;
+
+    /* The RVV path below depends on the exact layout of the source and destination structures */
+    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+    size_t vl = 0;
+    for (size_t i = 0; i < nbSequences; i += vl) {
+
+        vl = __riscv_vsetvl_e32m2(nbSequences-i);
+        // Segmented load of the four U32 fields of each ZSTD_Sequence
+        // (offset, litLength, matchLength, rep)
+        vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
+            (const uint32_t*)&inSeqs[i],
+            vl
+        );
+        vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
+        vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
+        vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
+        // offset + ZSTD_REP_NUM
+        vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl); 
+        // Flag litLength values that exceed the 16-bit range, then narrow to U16
+        vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl);
+        vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl);
+
+        // Same for matchLength: flag values whose mlBase would exceed 16 bits, then subtract MINMATCH and narrow
+        vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl);
+        vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl);
+
+        // Pack two 16-bit fields into a 32-bit value (little-endian)
+        // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase
+        vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2(
+            __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit
+            16, 
+            vl
+        );
+        v_lit_ml_combined = __riscv_vor_vv_u32m2(
+            v_lit_ml_combined,
+            __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl),
+            vl
+        );
+        // Interleave offBase with the packed litLength/mlBase words and store
+        // them as 8-byte SeqDef entries using a 2-field segmented store
+        vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2(
+            v_offBase,
+            v_lit_ml_combined
+        );
+        __riscv_vsseg2e32_v_u32m2x2(
+            (uint32_t*)&dstSeqs[i],
+            store_data,
+            vl
+        );
+        // Find the first lane, if any, whose length exceeded the 16-bit range
+        int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl);
+        int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl);
+
+        if (UNLIKELY(first_ml != -1)) {
+            assert(longLen == 0);
+            longLen = i + first_ml + 1;                 /* long matchLength: 1-based position */
+        }
+        if (UNLIKELY(first_lit != -1)) {
+            assert(longLen == 0);
+            longLen = i + first_lit + 1 + nbSequences;  /* long litLength: 1-based position, offset by nbSequences */
+        }
+    }
+    return longLen;
+}
+
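
For readers less familiar with the RVV intrinsics, the loop above is a vectorized form of the following scalar sketch. It is illustrative only, not part of this commit; it simply restates the conversion and long-length reporting described in the comment block, using the SeqDef/ZSTD_Sequence fields asserted above.

static size_t convertSequences_noRepcodes_scalarSketch(SeqDef* dstSeqs,
                                                       const ZSTD_Sequence* inSeqs,
                                                       size_t nbSequences)
{
    size_t longLen = 0;
    size_t n;
    for (n = 0; n < nbSequences; n++) {
        dstSeqs[n].offBase   = inSeqs[n].offset + ZSTD_REP_NUM;          /* offset -> offBase */
        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;                 /* truncated to 16 bits */
        dstSeqs[n].mlBase    = (U16)(inSeqs[n].matchLength - MINMATCH);  /* matchLength - MINMATCH, truncated */
        if (inSeqs[n].matchLength > 65535 + MINMATCH) {
            assert(longLen == 0);
            longLen = n + 1;                    /* long matchLength: 1-based position */
        }
        if (inSeqs[n].litLength > 65535) {
            assert(longLen == 0);
            longLen = n + 1 + nbSequences;      /* long litLength: position offset by nbSequences */
        }
    }
    return longLen;
}
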
 /* the vector implementation could also be ported to SSSE3,
  * but since this implementation is targeting modern systems (>= Sapphire Rapid),
  * it's not useful to develop and maintain code for older pre-AVX2 platforms */
@@ -7451,6 +7538,70 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
     }
 }
 
+#elif defined ZSTD_ARCH_RISCV_RVV
+
+BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
+{
+    size_t totalMatchSize = 0;
+    size_t litSize = 0;
+    size_t i = 0;
+    int found_terminator = 0; 
+    size_t vl_max = __riscv_vsetvlmax_e32m1();
+    vuint32m1_t v_lit_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
+    vuint32m1_t v_match_sum = __riscv_vmv_v_x_u32m1(0, vl_max);
+
+    for (; i < nbSeqs; ) {
+        size_t vl = __riscv_vsetvl_e32m2(nbSeqs - i);
+
+        // Segmented load of the four U32 fields of each ZSTD_Sequence
+        vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4(
+            (const uint32_t*)&seqs[i],
+            vl
+        );
+        vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0);
+        vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1);
+        vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2);
+
+        // Check if any element has a matchLength of 0
+        vbool16_t mask = __riscv_vmseq_vx_u32m2_b16(v_match, 0, vl);
+        int first_zero = __riscv_vfirst_m_b16(mask, vl);
+
+        if (first_zero >= 0) {
+            // Terminator found: shorten the active length to include it (index + 1),
+            // then accumulate the literal and match lengths of the remaining elements
+            vl = first_zero + 1;
+
+            v_lit_sum = __riscv_vredsum_vs_u32m2_u32m1(__riscv_vslidedown_vx_u32m2(v_lit, 0, vl), v_lit_sum, vl);
+            v_match_sum = __riscv_vredsum_vs_u32m2_u32m1(__riscv_vslidedown_vx_u32m2(v_match, 0, vl), v_match_sum, vl);
+
+            i += vl;
+            found_terminator = 1; 
+            assert(seqs[i - 1].offset == 0);
+            break;
+        } else {
+
+            v_lit_sum = __riscv_vredsum_vs_u32m2_u32m1(v_lit, v_lit_sum, vl);
+            v_match_sum = __riscv_vredsum_vs_u32m2_u32m1(v_match, v_match_sum, vl);
+            i += vl;
+        }
+    }
+    litSize = __riscv_vmv_x_s_u32m1_u32(v_lit_sum);
+    totalMatchSize = __riscv_vmv_x_s_u32m1_u32(v_match_sum);
+
+    if (!found_terminator && i == nbSeqs) {
+        BlockSummary bs;
+        bs.nbSequences = ERROR(externalSequences_invalid);
+        return bs;
+    }
+    {   BlockSummary bs;
+        bs.nbSequences = i;
+        bs.blockSize = litSize + totalMatchSize;
+        bs.litSize = litSize;
+        return bs;
+    }
+}
+
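
The RVV loop above accumulates literal and match lengths until it reaches the terminating sequence (matchLength == 0, offset == 0). A plain scalar sketch of the same computation is shown below for reference only; it is not part of this commit (the generic scalar version of ZSTD_get1BlockSummary remains in the #else branch that follows).

static BlockSummary get1BlockSummary_scalarSketch(const ZSTD_Sequence* seqs, size_t nbSeqs)
{
    size_t totalMatchSize = 0;
    size_t litSize = 0;
    size_t n;
    for (n = 0; n < nbSeqs; n++) {
        litSize += seqs[n].litLength;            /* the terminator's literals are counted too */
        totalMatchSize += seqs[n].matchLength;
        if (seqs[n].matchLength == 0) {          /* block terminator */
            assert(seqs[n].offset == 0);
            break;
        }
    }
    if (n == nbSeqs) {                           /* no terminator found: invalid input */
        BlockSummary bs;
        bs.nbSequences = ERROR(externalSequences_invalid);
        return bs;
    }
    {   BlockSummary bs;
        bs.nbSequences = n + 1;                  /* number of sequences, including the terminator */
        bs.blockSize = litSize + totalMatchSize;
        bs.litSize = litSize;
        return bs;
    }
}
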
 #else
 
 BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)