On 64-bit platforms, disassembling the code shows that send_huff() performs
a left shift followed by a right shift. Both are the result of integer
truncation and zero-extension caused solely by using different types at
different levels of the call chain. By making enqueue24() take a 64-bit
integer on input and send_huff() take an unsigned long (which is 64 bits
wide on most of those platforms), we can remove one shift from the hot path
and gain about 1% performance without affecting other platforms.
This is slz upstream commit
fd165b36c4621579c5305cf3bb3a7f5410d3720b.
* 32-bit words into output buffer. X must not contain non-zero bits above
* xbits.
*/
-static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
+static inline void enqueue24(struct slz_stream *strm, uint64_t x, uint32_t xbits)
{
- uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
+ uint64_t queue = strm->queue + (x << strm->qbits);
uint32_t qbits = strm->qbits + xbits;
if (__builtin_expect(qbits >= 32, 1)) {
strm->outbuf += 4;
}
-static inline void send_huff(struct slz_stream *strm, uint32_t code)
+/* Using long because faster on 64-bit (can save one shift) */
+static inline void send_huff(struct slz_stream *strm, unsigned long code)
{
uint32_t bits;