From bd38fc2c5f21536bc523f9a42c5e642ea9407ae4 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 20 Jun 2025 15:29:17 +0000 Subject: [PATCH] AArch64: Enhance struct access in Huffman decode 2X MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In the multi-stream multi-symbol Huffman decoder GCC generates suboptimal code - emitting more loads for HUF_DEltX2 struct member accesses. Forcing it to use 32-bit loads and bit arithmetic to extract the necessary parts (UBFX) improves the overall decode speed. Also avoid integer type conversions in the symbol decodes, which leads to better instruction selection in table lookup accesses. On AArch64 the decoder no longer runs into register-pressure limits, so we can simplify the hot path and improve throughput Decompression uplifts on a Neoverse V2 system, using Zstd-1.5.8 compiled with "-O3 -march=armv8.2-a+sve2": Clang-20 Clang-* GCC-13 GCC-14 GCC-15 1#silesia.tar: +0.820% +1.365% +2.480% +1.348% +0.987% 2#silesia.tar: +0.426% +0.784% +1.218% +0.665% +0.554% 3#silesia.tar: +0.112% +0.389% +0.508% +0.188% +0.261% * Requires Clang-21 support from LLVM commit hash `a53003fe23cb6c871e72d70ff2d3a075a7490da2` (Clang-21 hasn’t been released as of this writing) --- lib/decompress/huf_decompress.c | 94 ++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index c7e342648..920206178 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -785,19 +785,19 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif -#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ - do { \ - int const index = (int)(bits[(_stream)] >> 53); \ - int const entry = (int)dtable[index]; \ - bits[(_stream)] <<= (entry & 0x3F); \ - op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ + do { \ + U64 const index = bits[(_stream)] >> 53; \ + U16 const entry = dtable[index]; \ + bits[(_stream)] <<= entry & 0x3F; \ + op[(_stream)][(_symbol)] = (BYTE)(entry >> 8); \ } while (0) -#define HUF_4X1_RELOAD_STREAM(_stream) \ +#define HUF_5X1_RELOAD_STREAM(_stream) \ do { \ - int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ - int const nbBits = ctz & 7; \ - int const nbBytes = ctz >> 3; \ + U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + U64 const nbBits = ctz & 7; \ + U64 const nbBytes = ctz >> 3; \ op[(_stream)] += 5; \ ip[(_stream)] -= nbBytes; \ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ @@ -816,11 +816,11 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); /* Reload each of the 4 the bitstreams */ - HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); + HUF_4X_FOR_EACH_STREAM(HUF_5X1_RELOAD_STREAM); } while (op[3] < olimit); #undef HUF_4X1_DECODE_SYMBOL -#undef HUF_4X1_RELOAD_STREAM +#undef HUF_5X1_RELOAD_STREAM } _out: @@ -1603,57 +1603,65 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* } #endif -#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ - do { \ - if ((_decode3) || (_stream) != 3) { \ - int const index = (int)(bits[(_stream)] >> 53); \ - HUF_DEltX2 const entry = dtable[index]; \ - MEM_write16(op[(_stream)], entry.sequence); \ - bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ - op[(_stream)] += (entry.length); \ - } \ +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ + do { \ + if ((_decode3) || (_stream) != 3) { \ + U64 const index = bits[(_stream)] >> 53; \ + size_t const entry = MEM_readLE32(&dtable[index]); \ + MEM_write16(op[(_stream)], (U16)entry); \ + bits[(_stream)] <<= (entry >> 16) & 0x3F; \ + op[(_stream)] += entry >> 24; \ + } \ } while (0) -#define HUF_4X2_RELOAD_STREAM(_stream) \ +#define HUF_5X2_RELOAD_STREAM(_stream, _decode3) \ do { \ - HUF_4X2_DECODE_SYMBOL(3, 1); \ + if (_decode3) HUF_4X2_DECODE_SYMBOL(3, 1); \ { \ - int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ - int const nbBits = ctz & 7; \ - int const nbBytes = ctz >> 3; \ + U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + U64 const nbBits = ctz & 7; \ + U64 const nbBytes = ctz >> 3; \ ip[(_stream)] -= nbBytes; \ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ bits[(_stream)] <<= nbBits; \ } \ } while (0) +#if defined(__aarch64__) +# define HUF_4X2_4WAY 1 +#else +# define HUF_4X2_4WAY 0 +#endif +#define HUF_4X2_3WAY !HUF_4X2_4WAY + /* Manually unroll the loop because compilers don't consistently * unroll the inner loops, which destroys performance. */ do { - /* Decode 5 symbols from each of the first 3 streams. - * The final stream will be decoded during the reload phase - * to reduce register pressure. + /* Decode 5 symbols from each of the first 3 or 4 streams. + * In the 3-way case the final stream will be decoded during + * the reload phase to reduce register pressure. */ - HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); - HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); - HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); - HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); - HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); - - /* Decode one symbol from the final stream */ - HUF_4X2_DECODE_SYMBOL(3, 1); - - /* Decode 4 symbols from the final stream & reload bitstreams. - * The final stream is reloaded last, meaning that all 5 symbols - * are decoded from the final stream before it is reloaded. + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY); + + /* In the 3-way case decode one symbol from the final stream. */ + HUF_4X2_DECODE_SYMBOL(3, HUF_4X2_3WAY); + + /* In the 3-way case decode 4 symbols from the final stream & + * reload bitstreams. The final stream is reloaded last, meaning + * that all 5 symbols are decoded from the final stream before + * it is reloaded. */ - HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_5X2_RELOAD_STREAM, HUF_4X2_3WAY); } while (op[3] < olimit); } #undef HUF_4X2_DECODE_SYMBOL -#undef HUF_4X2_RELOAD_STREAM +#undef HUF_5X2_RELOAD_STREAM _out: -- 2.47.2