From e6dccbf48246f4e2844251972fcc0946a5de5154 Mon Sep 17 00:00:00 2001 From: Han Zhu Date: Mon, 27 Mar 2023 15:57:55 -0700 Subject: [PATCH] Inline BIT_reloadDStream Inlining `BIT_reloadDStream` provided a >3% decompression speed improvement for a clang PGO-optimized zstd binary, measured using the Silesia corpus with compression level 1. The win comes from improved register allocation, which leads to fewer spills and reloads. Take a look at this comparison of profile-annotated hot assembly before and after this change: https://www.diffchecker.com/UjDGIyLz/. The diff is a bit messy, but notice three fewer moves after inlining. In general LLVM's register allocator works better when it can see more code. For example, when the register allocator sees a call instruction, it partitions the registers into caller-saved registers and callee-saved registers, and it is not free to do whatever it wants with all the registers for the current function. Inlining the callee lets the register allocator access all registers and use them more flexibly. --- lib/common/bitstream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h index db1b4cf13..72b0b3df2 100644 --- a/lib/common/bitstream.h +++ b/lib/common/bitstream.h @@ -396,7 +396,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) * This function is safe, it guarantees it will not read beyond src buffer. * @return : status of `BIT_DStream_t` internal register. * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ -MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; -- 2.47.2