From: Arpad Panyik
Date: Tue, 24 Jun 2025 11:26:58 +0000 (+0000)
Subject: AArch64: Improve ZSTD_decodeSequence performance
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a28e8182b1aa6e1e17bcdd099630ea67c4143d32;p=thirdparty%2Fzstd.git

AArch64: Improve ZSTD_decodeSequence performance

LLVM's alias analysis sometimes fails to see that a fixed-size array
member of a struct cannot alias other members of the same struct.
This patch:

- Caches the prevOffset[] array in local variables, reducing accesses
  through struct indirection and aiding Clang's load/store alias
  analysis.
- Converts dynamic array indexing into conditional-move arithmetic,
  eliminating branches and extra loads/stores on out-of-order CPUs.
- Reloads the bitstream only when match-length bits are actually
  consumed, so at most one reload is needed per match-length read;
  this improves branch-prediction rates.
- Removes the UNLIKELY() hint, which recent compilers no longer need;
  dropping it costs nothing.

Decompression uplifts measured on a Neoverse V2 system, using
Zstd-1.5.8 compiled with "-O3 -march=armv8.2-a+sve2":

                Clang-19   Clang-20   Clang-21*   GCC-14    GCC-15
1#silesia.tar:  +11.556%   +16.203%    +0.240%   +2.216%   +7.891%
2#silesia.tar:  +15.493%   +21.140%    -0.041%   +2.850%   +9.926%
3#silesia.tar:  +16.887%   +22.570%    -0.183%   +3.056%  +10.660%
4#silesia.tar:  +17.785%   +23.315%    -0.262%   +3.343%  +11.187%
5#silesia.tar:  +18.125%   +24.175%    -0.466%   +3.350%  +11.228%
6#silesia.tar:  +17.607%   +23.339%    -0.591%   +3.175%  +10.851%
7#silesia.tar:  +17.463%   +22.837%    -0.486%   +3.292%  +10.868%

* Clang-21 was built from LLVM commit
  `a53003fe23cb6c871e72d70ff2d3a075a7490da2`; Clang-21 had not been
  released as of this writing.

Co-authored-by: David Sherwood <David.Sherwood@arm.com>
Co-authored-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
---

diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 6174a250b..b2ccd92a1 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -1236,6 +1236,10 @@ FORCE_INLINE_TEMPLATE seq_t
 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
 {
     seq_t seq;
+#if defined(__aarch64__)
+    size_t prevOffset0 = seqState->prevOffset[0];
+    size_t prevOffset1 = seqState->prevOffset[1];
+    size_t prevOffset2 = seqState->prevOffset[2];
     /*
      * ZSTD_seqSymbol is a 64 bits wide structure.
      * It can be loaded in one operation
@@ -1244,7 +1248,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
      * operations that cause performance drop. This can be avoided by using this
      * ZSTD_memcpy hack.
      */
-#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+# if defined(__GNUC__) && !defined(__clang__)
     ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
     ZSTD_seqSymbol* const llDInfo = &llDInfoS;
     ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
@@ -1252,11 +1256,119 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
     ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
     ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
     ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
-#else
+# else
+    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+# endif
+    seq.matchLength = mlDInfo->baseValue;
+    seq.litLength = llDInfo->baseValue;
+    {   U32 const ofBase = ofDInfo->baseValue;
+        BYTE const llBits = llDInfo->nbAdditionalBits;
+        BYTE const mlBits = mlDInfo->nbAdditionalBits;
+        BYTE const ofBits = ofDInfo->nbAdditionalBits;
+        BYTE const totalBits = llBits+mlBits+ofBits;
+
+        U16 const llNext = llDInfo->nextState;
+        U16 const mlNext = mlDInfo->nextState;
+        U16 const ofNext = ofDInfo->nextState;
+        U32 const llnbBits = llDInfo->nbBits;
+        U32 const mlnbBits = mlDInfo->nbBits;
+        U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
+        /* As GCC has better branch and block analyzers, sometimes it is only
+         * valuable to mark likeliness for Clang.
+         */
+
+        /* sequence */
+        {   size_t offset;
+            if (ofBits > 1) {
+                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
+                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
+                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                    BIT_reloadDStream(&seqState->DStream);
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                } else {
+                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+                }
+                prevOffset2 = prevOffset1;
+                prevOffset1 = prevOffset0;
+                prevOffset0 = offset;
+            } else {
+                U32 const ll0 = (llDInfo->baseValue == 0);
+                if (LIKELY((ofBits == 0))) {
+                    if (ll0) {
+                        offset = prevOffset1;
+                        prevOffset1 = prevOffset0;
+                        prevOffset0 = offset;
+                    } else {
+                        offset = prevOffset0;
+                    }
+                } else {
+                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+                    {   size_t temp = (offset == 1) ? prevOffset1
+                                    : (offset == 3) ? prevOffset0 - 1
+                                    : (offset >= 2) ? prevOffset2
+                                    : prevOffset0;
+                        /* 0 is not valid: input corrupted => force offset to -1 =>
+                         * corruption detected at execSequence.
+                         */
+                        temp -= !temp;
+                        prevOffset2 = (offset == 1) ?
+                                      prevOffset2 : prevOffset1;
+                        prevOffset1 = prevOffset0;
+                        prevOffset0 = offset = temp;
+            }   }   }
+            seq.offset = offset;
+        }
+
+        if (mlBits > 0) {
+            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+            if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+                BIT_reloadDStream(&seqState->DStream);
+            if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_reloadDStream(&seqState->DStream);
+        }
+
+        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+        if (llBits > 0)
+            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+        if (MEM_32bits())
+            BIT_reloadDStream(&seqState->DStream);
+
+        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+        if (!isLastSeq) {
+            /* Don't update FSE state for last sequence. */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <= 9 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <= 9 bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <= 8 bits */
+            BIT_reloadDStream(&seqState->DStream);
+        }
+    }
+    seqState->prevOffset[0] = prevOffset0;
+    seqState->prevOffset[1] = prevOffset1;
+    seqState->prevOffset[2] = prevOffset2;
+#else /* !defined(__aarch64__) */
     const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
     const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
     const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
-#endif
     seq.matchLength = mlDInfo->baseValue;
     seq.litLength = llDInfo->baseValue;
     {   U32 const ofBase = ofDInfo->baseValue;
@@ -1275,10 +1387,8 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
         assert(llBits <= MaxLLBits);
         assert(mlBits <= MaxMLBits);
         assert(ofBits <= MaxOff);
-        /*
-         * As gcc has better branch and block analyzers, sometimes it is only
-         * valuable to mark likeliness for clang, it gives around 3-4% of
-         * performance.
+        /* As GCC has better branch and block analyzers, sometimes it is only
+         * valuable to mark likeliness for Clang.
          */
 
         /* sequence */
@@ -1340,7 +1450,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
                     (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
         if (!isLastSeq) {
-            /* don't update FSE state for last Sequence */
+            /* Don't update FSE state for last sequence. */
             ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <= 9 bits */
             ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <= 9 bits */
             if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
@@ -1348,6 +1458,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
             ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <= 8 bits */
             BIT_reloadDStream(&seqState->DStream);
         }
     }
+#endif /* defined(__aarch64__) */
     return seq;
 }
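
Note: as a standalone illustration of the branch-free repeat-offset update
the patch introduces, the sketch below mirrors its logic with the
three-entry history held in plain locals. The names (RepOffsets,
update_rep_offsets) are illustrative only, not zstd API; the point is that
the nested conditionals lower to conditional selects (CSEL on AArch64)
instead of a dynamically indexed load/store chain through
seqState->prevOffset[].

    /* Minimal sketch, assuming 'code' plays the role of
     * ofBase + ll0 + BIT_readBitsFast(..., 1) in the patch, i.e. is in [1,3]. */
    #include <stdio.h>
    #include <stddef.h>

    typedef struct {
        size_t prev0, prev1, prev2;  /* repeat-offset history, most recent first */
    } RepOffsets;                    /* illustrative type, not part of zstd */

    static size_t update_rep_offsets(RepOffsets* r, size_t code)
    {
        /* Select the new offset with nested conditionals; compilers emit
         * conditional moves here rather than indexing an array. */
        size_t temp = (code == 1) ? r->prev1
                    : (code == 3) ? r->prev0 - 1
                    : (code >= 2) ? r->prev2
                    : r->prev0;
        temp -= !temp;  /* 0 is invalid (corrupt input): force to (size_t)-1
                         * so a later bounds check rejects it */
        /* Rotate the history without branches: prev2 keeps its value only
         * when code == 1, i.e. when prev1 was merely swapped to the front. */
        r->prev2 = (code == 1) ? r->prev2 : r->prev1;
        r->prev1 = r->prev0;
        r->prev0 = temp;
        return temp;
    }

    int main(void)
    {
        RepOffsets r = { 8, 4, 2 };              /* arbitrary starting history */
        size_t off = update_rep_offsets(&r, 2);  /* code 2 selects prev2 */
        printf("offset=%zu history={%zu,%zu,%zu}\n",
               off, r.prev0, r.prev1, r.prev2);  /* offset=2 history={2,8,4} */
        return 0;
    }

Keeping the history in locals is what lets the compiler hold all three
values in registers across the decode loop; the patch writes them back to
seqState->prevOffset[] once, at the end of ZSTD_decodeSequence.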