]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Remove clang-only branch hints from ZSTD_decodeSequence 3576/head
authorHan Zhu <zhuhan7737@gmail.com>
Tue, 28 Mar 2023 21:33:50 +0000 (14:33 -0700)
committerHan Zhu <zhuhan7737@gmail.com>
Tue, 28 Mar 2023 22:36:22 +0000 (15:36 -0700)
Looking at the __builtin_expect in ZSTD_decodeSequence:

{   size_t offset;
    #if defined(__clang__)
 if (LIKELY(ofBits > 1)) {
    #else
 if (ofBits > 1) {
    #endif
 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);

From profile-annotated assembly, the probability of ofBits > 1 is about 75%
(101k counts out of 135k counts). This is much smaller than the recommended
likelihood to use __builtin_expect which is 99%. As a result, clang moved the
else block further away which hurts cache locality. Removing this
__built_expect along with two others in ZSTD_decodeSequence gave better
performance when PGO is enabled. I suggest to remove these branch hints and
rely on PGO which leverages runtime profiles from actual workload to calculate
branch probability instead.

lib/decompress/zstd_decompress_block.c

index 2a7d70d69115bd02286095612074d98ee459a326..f018271765d96a4d79634a16dc281db362ef23fe 100644 (file)
@@ -1232,11 +1232,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
 
         /* sequence */
         {   size_t offset;
-    #if defined(__clang__)
-            if (LIKELY(ofBits > 1)) {
-    #else
             if (ofBits > 1) {
-    #endif
                 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                 ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
                 ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
@@ -1273,11 +1269,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
             seq.offset = offset;
         }
 
-    #if defined(__clang__)
-        if (UNLIKELY(mlBits > 0))
-    #else
         if (mlBits > 0)
-    #endif
             seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
 
         if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
@@ -1287,11 +1279,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
         ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
-    #if defined(__clang__)
-        if (UNLIKELY(llBits > 0))
-    #else
         if (llBits > 0)
-    #endif
             seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
 
         if (MEM_32bits())