]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
AArch64: Enhance struct access in Huffman decode 2X 4413/head
authorArpad Panyik <Arpad.Panyik@arm.com>
Fri, 20 Jun 2025 15:29:17 +0000 (15:29 +0000)
committerArpad Panyik <Arpad.Panyik@arm.com>
Mon, 23 Jun 2025 14:16:25 +0000 (14:16 +0000)
In the multi-stream multi-symbol Huffman decoder GCC generates
suboptimal code - emitting more loads for HUF_DEltX2 struct member
accesses. Forcing it to use 32-bit loads and bit arithmetic to extract
the necessary parts (UBFX) improves the overall decode speed.

Also avoid integer type conversions in the symbol decodes, which
leads to better instruction selection in table lookup accesses.

On AArch64 the decoder no longer runs into register-pressure limits,
so we can simplify the hot path and improve throughput

Decompression uplifts on a Neoverse V2 system, using Zstd-1.5.8
compiled with "-O3 -march=armv8.2-a+sve2":

                 Clang-20   Clang-*    GCC-13    GCC-14    GCC-15
 1#silesia.tar:   +0.820%   +1.365%   +2.480%   +1.348%   +0.987%
 2#silesia.tar:   +0.426%   +0.784%   +1.218%   +0.665%   +0.554%
 3#silesia.tar:   +0.112%   +0.389%   +0.508%   +0.188%   +0.261%

* Requires Clang-21 support from LLVM commit hash
  `a53003fe23cb6c871e72d70ff2d3a075a7490da2`
  (Clang-21 hasn’t been released as of this writing)

lib/decompress/huf_decompress.c

index c7e342648f66979e7595a93065d39be82f2407bb..920206178247ac50f0dc010e016a3ebaca8be555 100644 (file)
@@ -785,19 +785,19 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         }
 #endif
 
-#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
-    do {                                                        \
-        int const index = (int)(bits[(_stream)] >> 53);         \
-        int const entry = (int)dtable[index];                   \
-        bits[(_stream)] <<= (entry & 0x3F);                     \
-        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)        \
+    do {                                               \
+        U64 const index = bits[(_stream)] >> 53;       \
+        U16 const entry = dtable[index];               \
+        bits[(_stream)] <<= entry & 0x3F;              \
+        op[(_stream)][(_symbol)] = (BYTE)(entry >> 8); \
     } while (0)
 
-#define HUF_4X1_RELOAD_STREAM(_stream)                              \
+#define HUF_5X1_RELOAD_STREAM(_stream)                              \
     do {                                                            \
-        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
-        int const nbBits = ctz & 7;                                 \
-        int const nbBytes = ctz >> 3;                               \
+        U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+        U64 const nbBits = ctz & 7;                                 \
+        U64 const nbBytes = ctz >> 3;                               \
         op[(_stream)] += 5;                                         \
         ip[(_stream)] -= nbBytes;                                   \
         bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
@@ -816,11 +816,11 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
 
             /* Reload each of the 4 the bitstreams */
-            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
+            HUF_4X_FOR_EACH_STREAM(HUF_5X1_RELOAD_STREAM);
         } while (op[3] < olimit);
 
 #undef HUF_4X1_DECODE_SYMBOL
-#undef HUF_4X1_RELOAD_STREAM
+#undef HUF_5X1_RELOAD_STREAM
     }
 
 _out:
@@ -1603,57 +1603,65 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         }
 #endif
 
-#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                      \
-    do {                                                              \
-        if ((_decode3) || (_stream) != 3) {                           \
-            int const index = (int)(bits[(_stream)] >> 53);           \
-            HUF_DEltX2 const entry = dtable[index];                   \
-            MEM_write16(op[(_stream)], entry.sequence); \
-            bits[(_stream)] <<= (entry.nbBits) & 0x3F;                \
-            op[(_stream)] += (entry.length);                          \
-        }                                                             \
+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)               \
+    do {                                                       \
+        if ((_decode3) || (_stream) != 3) {                    \
+            U64 const index = bits[(_stream)] >> 53;           \
+            size_t const entry = MEM_readLE32(&dtable[index]); \
+            MEM_write16(op[(_stream)], (U16)entry);            \
+            bits[(_stream)] <<= (entry >> 16) & 0x3F;          \
+            op[(_stream)] += entry >> 24;                      \
+        }                                                      \
     } while (0)
 
-#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
+#define HUF_5X2_RELOAD_STREAM(_stream, _decode3)                        \
     do {                                                                \
-        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
+        if (_decode3) HUF_4X2_DECODE_SYMBOL(3, 1);                      \
         {                                                               \
-            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
-            int const nbBits = ctz & 7;                                 \
-            int const nbBytes = ctz >> 3;                               \
+            U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+            U64 const nbBits = ctz & 7;                                 \
+            U64 const nbBytes = ctz >> 3;                               \
             ip[(_stream)] -= nbBytes;                                   \
             bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
             bits[(_stream)] <<= nbBits;                                 \
         }                                                               \
     } while (0)
 
+#if defined(__aarch64__)
+#  define HUF_4X2_4WAY 1
+#else
+#  define HUF_4X2_4WAY 0
+#endif
+#define HUF_4X2_3WAY !HUF_4X2_4WAY
+
         /* Manually unroll the loop because compilers don't consistently
          * unroll the inner loops, which destroys performance.
          */
         do {
-            /* Decode 5 symbols from each of the first 3 streams.
-             * The final stream will be decoded during the reload phase
-             * to reduce register pressure.
+            /* Decode 5 symbols from each of the first 3 or 4 streams.
+             * In the 3-way case the final stream will be decoded during
+             * the reload phase to reduce register pressure.
              */
-            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
-            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
-            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
-            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
-            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
-
-            /* Decode one symbol from the final stream */
-            HUF_4X2_DECODE_SYMBOL(3, 1);
-
-            /* Decode 4 symbols from the final stream & reload bitstreams.
-             * The final stream is reloaded last, meaning that all 5 symbols
-             * are decoded from the final stream before it is reloaded.
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
+
+            /* In the 3-way case decode one symbol from the final stream. */
+            HUF_4X2_DECODE_SYMBOL(3, HUF_4X2_3WAY);
+
+            /* In the 3-way case decode 4 symbols from the final stream &
+             * reload bitstreams. The final stream is reloaded last, meaning
+             * that all 5 symbols are decoded from the final stream before
+             * it is reloaded.
              */
-            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_5X2_RELOAD_STREAM, HUF_4X2_3WAY);
         } while (op[3] < olimit);
     }
 
 #undef HUF_4X2_DECODE_SYMBOL
-#undef HUF_4X2_RELOAD_STREAM
+#undef HUF_5X2_RELOAD_STREAM
 
 _out: