From d95123f2e68fc5a0feb222d03cacba5a044f03b6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=E6=9D=8E=E5=AD=90=E5=BB=BA?= Date: Fri, 16 May 2025 14:57:32 +0800 Subject: [PATCH] Improve speed of ZSTD_compressSequencesAndLiterals() using RVV --- lib/common/compiler.h | 6 ++ lib/compress/zstd_compress.c | 151 +++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) diff --git a/lib/common/compiler.h b/lib/common/compiler.h index 944774a7a..cafb35b71 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -218,6 +218,9 @@ # if defined(__ARM_NEON) || defined(_M_ARM64) # define ZSTD_ARCH_ARM_NEON # endif +# if defined(__riscv) && defined(__riscv_vector) +# define ZSTD_ARCH_RISCV_RVV +# endif # # if defined(ZSTD_ARCH_X86_AVX2) # include @@ -227,6 +230,9 @@ # elif defined(ZSTD_ARCH_ARM_NEON) # include # endif +# if defined(ZSTD_ARCH_RISCV_RVV) +# include +# endif #endif /* C-language Attributes are added in C23. */ diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index c8f6b2865..9b7aaf9f4 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -7284,6 +7284,93 @@ static size_t convertSequences_noRepcodes( return longLen; } +#elif defined ZSTD_ARCH_RISCV_RVV +#include +/* + * Convert `vl` sequences per iteration, using AVX2 intrinsics: + * - offset -> offBase = offset + 2 + * - litLength -> (U16) litLength + * - matchLength -> (U16)(matchLength - 3) + * - rep is ignored + * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). + * + * @returns 0 on succes, with no long length detected + * @returns > 0 if there is one long length (> 65535), + * indicating the position, and type. + */ +static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) { + size_t longLen = 0; + + /* RVV depends on the specific definition of target structures */ + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); + size_t vl = 0; + for (size_t i = 0; i < nbSequences; i += vl) { + + vl = __riscv_vsetvl_e32m2(nbSequences-i); + // Loading structure member variables + vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4( + (const int32_t*)&inSeqs[i], + vl + ); + vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0); + vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1); + vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2); + // offset + ZSTD_REP_NUM + vuint32m2_t v_offBase = __riscv_vadd_vx_u32m2(v_offset, ZSTD_REP_NUM, vl); + // Check for integer overflow + // Cast to a 16-bit variable + vbool16_t lit_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_lit, 65535, vl); + vuint16m1_t v_lit_clamped = __riscv_vncvt_x_x_w_u16m1(v_lit, vl); + + vbool16_t ml_overflow = __riscv_vmsgtu_vx_u32m2_b16(v_match, 65535+MINMATCH, vl); + vuint16m1_t v_ml_clamped = __riscv_vncvt_x_x_w_u16m1(__riscv_vsub_vx_u32m2(v_match, MINMATCH, vl), vl); + + // Pack two 16-bit fields into a 32-bit value (little-endian) + // The lower 16 bits contain litLength, and the upper 16 bits contain mlBase + vuint32m2_t v_lit_ml_combined = __riscv_vsll_vx_u32m2( + __riscv_vwcvtu_x_x_v_u32m2(v_ml_clamped, vl), // Convert matchLength to 32-bit + 16, + vl + ); + v_lit_ml_combined = __riscv_vor_vv_u32m2( + v_lit_ml_combined, + __riscv_vwcvtu_x_x_v_u32m2(v_lit_clamped, vl), + vl + ); + // Create a vector of SeqDef structures + // Store the offBase, litLength, and mlBase in a vector of SeqDef + vuint32m2x2_t store_data = __riscv_vcreate_v_u32m2x2( + v_offBase, + v_lit_ml_combined + ); + __riscv_vsseg2e32_v_u32m2x2( + (uint32_t*)&dstSeqs[i], + store_data, + vl + ); + // Find the first index where an overflow occurs + int first_ml = __riscv_vfirst_m_b16(ml_overflow, vl); + int first_lit = __riscv_vfirst_m_b16(lit_overflow, vl); + + if (UNLIKELY(first_ml != -1)) { + assert(longLen == 0); + longLen = i + first_ml + 1; + } + if (UNLIKELY(first_lit != -1)) { + assert(longLen == 0); + longLen = i + first_lit + 1 + nbSequences; + } + } + return longLen; +} + /* the vector implementation could also be ported to SSSE3, * but since this implementation is targeting modern systems (>= Sapphire Rapid), * it's not useful to develop and maintain code for older pre-AVX2 platforms */ @@ -7451,6 +7538,70 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) } } +#elif defined ZSTD_ARCH_RISCV_RVV + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t totalMatchSize = 0; + size_t litSize = 0; + size_t i = 0; + int found_terminator = 0; + size_t vl_max = __riscv_vsetvlmax_e32m1(); + vuint32m1_t v_lit_sum = __riscv_vmv_v_x_u32m1(0, vl_max); + vuint32m1_t v_match_sum = __riscv_vmv_v_x_u32m1(0, vl_max); + + for (; i < nbSeqs; ) { + size_t vl = __riscv_vsetvl_e32m2(nbSeqs - i); + + ptrdiff_t stride = sizeof(ZSTD_Sequence); // 16 + vuint32m2x4_t v_tuple = __riscv_vlseg4e32_v_u32m2x4( + (const int32_t*)&seqs[i], + vl + ); + vuint32m2_t v_offset = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 0); + vuint32m2_t v_lit = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 1); + vuint32m2_t v_match = __riscv_vget_v_u32m2x4_u32m2(v_tuple, 2); + + // Check if any element has a matchLength of 0 + vbool16_t mask = __riscv_vmseq_vx_u32m2_b16(v_match, 0, vl); + int first_zero = __riscv_vfirst_m_b16(mask, vl); + + if (first_zero >= 0) { + // Find the first zero byte and set the effective length to that index + 1 to + // recompute the cumulative vector length of literals and matches + vl = first_zero + 1; + + // recompute the cumulative vector length of literals and matches + v_lit_sum = __riscv_vredsum_vs_u32m2_u32m1(__riscv_vslidedown_vx_u32m2(v_lit, 0, vl), v_lit_sum, vl); + v_match_sum = __riscv_vredsum_vs_u32m2_u32m1(__riscv_vslidedown_vx_u32m2(v_match, 0, vl), v_match_sum, vl); + + i += vl; + found_terminator = 1; + assert(seqs[i - 1].offset == 0); + break; + } else { + + v_lit_sum = __riscv_vredsum_vs_u32m2_u32m1(v_lit, v_lit_sum, vl); + v_match_sum = __riscv_vredsum_vs_u32m2_u32m1(v_match, v_match_sum, vl); + i += vl; + } + } + litSize = __riscv_vmv_x_s_u32m1_u32(v_lit_sum); + totalMatchSize = __riscv_vmv_x_s_u32m1_u32(v_match_sum); + + if (!found_terminator && i==nbSeqs) { + BlockSummary bs; + bs.nbSequences = ERROR(externalSequences_invalid); + return bs; + } + { BlockSummary bs; + bs.nbSequences = i; + bs.blockSize = litSize + totalMatchSize; + bs.litSize = litSize; + return bs; + } +} + #else BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) -- 2.47.2