From: Sebastian Pop
Date: Wed, 7 Nov 2018 21:11:27 +0000 (-0600)
Subject: bug #117: speed up inflate_fast
X-Git-Tag: 1.9.9-b1~578
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dd70d6c467f26d727ea730ba577809fa0668940d;p=thirdparty%2Fzlib-ng.git

bug #117: speed up inflate_fast

Based on a patch by Nigel Tao:
https://github.com/madler/zlib/pull/292/commits/e0ff1f330cc03ee04843f857869b4036593ab39d

This patch makes unzipping of files up to 1.2x faster on x86_64. The other
part of Nigel Tao's patch (a further 1.3x speedup) is unsafe, as discussed
in the review of that pull request; zlib-ng already optimizes the memcpy
covered by that missing part in a different way.

The original patch was enabled only on little-endian machines. This patch
adapts the 64-bits-at-a-time load to big-endian machines as well.

Benchmarking notes from Hans Kristian Rosbach:
https://github.com/zlib-ng/zlib-ng/pull/224#issuecomment-444837182

Benchmark runs: 7, tested levels: 0-7, testfile 100M

develop at 796ad10 with -O3:
Level  Comp     Comptime min/avg/max  Decomptime min/avg/max
0      100.02%  0.01/0.01/0.02        0.08/0.09/0.11
1       47.08%  0.49/0.50/0.51        0.37/0.39/0.40
2       36.02%  1.10/1.12/1.13        0.39/0.39/0.40
3       34.77%  1.32/1.34/1.37        0.38/0.38/0.38
4       33.41%  1.50/1.53/1.56        0.37/0.37/0.38
5       33.07%  1.85/1.87/1.90        0.36/0.37/0.38
6       32.83%  2.54/2.57/2.61        0.36/0.37/0.38
avg     45.31%  1.28                  0.34
tot             62.60                 16.58

PR224 with -O3:
Level  Comp     Comptime min/avg/max  Decomptime min/avg/max
0      100.02%  0.01/0.01/0.02        0.09/0.09/0.10
1       47.08%  0.49/0.50/0.51        0.37/0.37/0.38
2       36.02%  1.09/1.11/1.13        0.38/0.38/0.39
3       34.77%  1.32/1.34/1.38        0.35/0.36/0.38
4       33.41%  1.49/1.52/1.54        0.36/0.36/0.37
5       33.07%  1.85/1.88/1.93        0.35/0.36/0.37
6       32.83%  2.55/2.58/2.65        0.35/0.35/0.36
avg     45.31%  1.28                  0.33
tot             62.48                 16.02

So I see about a 5.4% decompression speedup on my x86_64 machine: not quite
the 1.2x headline number, but a nice speedup nevertheless. This benchmark
measures the total execution time of minigzip, so process startup and I/O
overhead likely hide part of the gain. At -O2, I only see a 2.7% speedup.

(A standalone sketch of the 64-bit refill idiom, for experimentation,
follows the patch.)
---
diff --git a/infback.c b/infback.c
index 04553b72..b4fb562b 100644
--- a/infback.c
+++ b/infback.c
@@ -453,7 +453,8 @@ int ZEXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in_desc
 
         case LEN:
             /* use inflate_fast() if we have enough input and output */
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                 RESTORE();
                 if (state->whave < state->wsize)
                     state->whave = state->wsize - left;
diff --git a/inffast.c b/inffast.c
index 501a42d1..40e649a8 100644
--- a/inffast.c
+++ b/inffast.c
@@ -113,8 +113,8 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
    Entry assumptions:
 
         state->mode == LEN
-        strm->avail_in >= 6
-        strm->avail_out >= 258
+        strm->avail_in >= INFLATE_FAST_MIN_HAVE
+        strm->avail_out >= INFLATE_FAST_MIN_LEFT
         start >= strm->avail_out
         state->bits < 8
 
@@ -132,6 +132,10 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
       Therefore if strm->avail_in >= 6, then there is enough input to avoid
       checking for available input while decoding.
 
+    - On some architectures, it can be significantly faster (e.g. up to 1.2x
+      faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+      time, so INFLATE_FAST_MIN_HAVE == 8.
+
     - The maximum bytes that a single length/distance pair can output is 258
       bytes, which is the maximum length that can be coded.  inflate_fast()
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
 
@@ -155,7 +159,45 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     unsigned whave;             /* valid bytes in the window */
     unsigned wnext;             /* window write index */
     unsigned char *window;      /* allocated sliding window, if wsize != 0 */
-    uint32_t hold;              /* local strm->hold */
+
+    /* hold is a local copy of strm->hold. By default, hold satisfies the same
+       invariants that strm->hold does, namely that (hold >> bits) == 0. This
+       invariant is kept by loading bits into hold one byte at a time, like:
+
+       hold |= next_byte_of_input << bits; in++; bits += 8;
+
+       If we need to ensure that bits >= 15 then this code snippet is simply
+       repeated. Over one iteration of the outermost do/while loop, this
+       happens up to six times (48 bits of input), as described in the NOTES
+       above.
+
+       However, on some little endian architectures, it can be significantly
+       faster to load 64 bits once instead of 8 bits six times:
+
+       if (bits <= 16) {
+         hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+       }
+
+       Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+       by bits will overflow and lose those high bits, up to 2 bytes' worth.
+       The conservative estimate is therefore that we have read only 6 bytes
+       (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+       rest of the iteration, and we will not need to load another 8 bytes.
+
+       Inside this function, we no longer satisfy (hold >> bits) == 0, but
+       this is not problematic, even if that overflow does not land on an 8
+       bit byte boundary. Those excess bits will eventually shift down lower
+       as the Huffman decoder consumes input, and when new input bits need to
+       be loaded into the bits variable, the same input bits will be or'ed
+       over those existing bits. A bitwise or is idempotent: (a | b | b)
+       equals (a | b). Note that we therefore write that load operation as
+       "hold |= etc" and not "hold += etc".
+
+       Outside that loop, at the end of the function, hold is bitwise and'ed
+       with (1 << bits) - 1, so that, on function exit, the usual invariant
+       is restored: (state->hold >> state->bits) == 0.
+    */
+    uint64_t hold;              /* local strm->hold */
     unsigned bits;              /* local strm->bits */
     code const *lcode;          /* local strm->lencode */
     code const *dcode;          /* local strm->distcode */
@@ -171,10 +213,10 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     /* copy state to local variables */
     state = (struct inflate_state *)strm->state;
     in = strm->next_in;
-    last = in + (strm->avail_in - 5);
+    last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1));
     out = strm->next_out;
     beg = out - (start - strm->avail_out);
-    end = out + (strm->avail_out - 257);
+    end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
 
 #ifdef INFFAST_CHUNKSIZE
     safe = out + (strm->avail_out - INFFAST_CHUNKSIZE);
@@ -197,9 +239,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
        input data or output space */
     do {
         if (bits < 15) {
-            hold += load_short(in, bits);
-            in += 2;
-            bits += 16;
+            hold |= load_64_bits(in, bits);
+            in += 6;
+            bits += 48;
         }
         here = lcode[hold & lmask];
       dolen:
@@ -215,17 +257,18 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
             op &= 15;                       /* number of extra bits */
             if (op) {
                 if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                 }
                 len += BITS(op);
                 DROPBITS(op);
             }
             Tracevv((stderr, "inflate:         length %u\n", len));
             if (bits < 15) {
-                hold += load_short(in, bits);
-                in += 2;
-                bits += 16;
+                hold |= load_64_bits(in, bits);
+                in += 6;
+                bits += 48;
             }
             here = dcode[hold & dmask];
           dodist:
@@ -235,12 +278,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
                 dist = here.val;
                 op &= 15;                   /* number of extra bits */
                 if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
-                    if (bits < op) {
-                        hold += (uint32_t)(*in++) << bits;
-                        bits += 8;
-                    }
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                 }
                 dist += BITS(op);
 #ifdef INFLATE_STRICT
@@ -412,8 +452,12 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     /* update state and return */
     strm->next_in = in;
     strm->next_out = out;
-    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
-    strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end));
+    strm->avail_in =
+        (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in)
+                             : (INFLATE_FAST_MIN_HAVE - 1) - (in - last));
+    strm->avail_out =
+        (unsigned)(out < end ? (INFLATE_FAST_MIN_LEFT - 1) + (end - out)
+                             : (INFLATE_FAST_MIN_LEFT - 1) - (out - end));
     state->hold = hold;
     state->bits = bits;
     return;
diff --git a/inffast.h b/inffast.h
index 464f6c8d..d2a9a72e 100644
--- a/inffast.h
+++ b/inffast.h
@@ -19,4 +19,7 @@ typedef uint8x16_t inffast_chunk_t;
 # define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
 #endif
 
+#define INFLATE_FAST_MIN_HAVE 8
+#define INFLATE_FAST_MIN_LEFT 258
+
 #endif /* INFFAST_H_ */
diff --git a/inflate.c b/inflate.c
index d2e621ea..ee0e5308 100644
--- a/inflate.c
+++ b/inflate.c
@@ -1016,7 +1016,8 @@ int ZEXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int flush) {
         case LEN_:
             state->mode = LEN;
         case LEN:
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                 RESTORE();
                 inflate_fast(strm, out);
                 LOAD();
diff --git a/memcopy.h b/memcopy.h
index 84d3c689..7f8697b1 100644
--- a/memcopy.h
+++ b/memcopy.h
@@ -4,23 +4,15 @@
 #ifndef MEMCOPY_H_
 #define MEMCOPY_H_
 
-/* Load a short from IN and place the bytes at offset BITS in the result.
- */
-static inline uint32_t load_short(const unsigned char *in, unsigned bits) {
-    union {
-        uint16_t a;
-        uint8_t b[2];
-    } chunk;
-    MEMCPY(&chunk, in, sizeof(uint16_t));
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+    uint64_t chunk;
+    MEMCPY(&chunk, in, sizeof(chunk));
 #if BYTE_ORDER == LITTLE_ENDIAN
-    uint32_t res = chunk.a;
-    return res << bits;
+    return chunk << bits;
 #else
-    uint32_t c0 = chunk.b[0];
-    uint32_t c1 = chunk.b[1];
-    c0 <<= bits;
-    c1 <<= bits + 8;
-    return c0 + c1;
+    return ZSWAP64(chunk) << bits;
 #endif
 }
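
For experimentation outside of zlib-ng, here is a minimal, self-contained
C99 sketch of the refill idiom described by the new inffast.c comment
above; it is not part of the patch. The names load64(), input[], and the
13-bit "codes" are invented for this illustration: load64() is a portable,
byte-by-byte stand-in for load_64_bits() that sidesteps MEMCPY, BYTE_ORDER
and ZSWAP64, so it behaves identically on little- and big-endian machines.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for load_64_bits(): read 8 bytes from `in` as a little-endian
   64-bit value and pre-shift it past the `bits` already held.  Up to 2
   bytes' worth of the chunk can fall off the top of the 64-bit result,
   which is why callers only advance `in` by 6. */
static uint64_t load64(const unsigned char *in, unsigned bits) {
    uint64_t chunk = 0;
    for (int i = 0; i < 8; i++)
        chunk |= (uint64_t)in[i] << (8 * i);
    return chunk << bits;
}

int main(void) {
    /* Arbitrary stand-in for compressed input; a real inflate consumes
       these bytes least-significant-bit first. */
    const unsigned char input[16] = {
        0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
        0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88
    };
    const unsigned char *in = input;
    uint64_t hold = 0;  /* bit accumulator, like inflate_fast's hold */
    unsigned bits = 0;  /* bits officially accounted for in hold */

    /* First refill, exactly as the patched loop does: OR in 8 bytes but
       account for only 6 of them (48 bits). */
    if (bits < 15) {
        hold |= load64(in, bits);
        in += 6;
        bits += 48;
    }

    /* Consume 13 bits three times, as length/distance decodes would;
       bits drops from 48 to 9, below the 15-bit refill threshold. */
    for (int step = 0; step < 3; step++) {
        printf("decoded 13-bit code 0x%04x\n", (unsigned)(hold & 0x1fff));
        hold >>= 13;
        bits -= 13;
    }

    /* Second refill.  Its low 16 bits duplicate input bits that already
       sit in hold (the 2 "extra" bytes of the first load), and OR-ing
       identical bits back in is a no-op: (a | b | b) == (a | b).  This
       is why the load uses |= rather than +=. */
    uint64_t before = hold;
    hold |= load64(in, bits);
    in += 6;
    bits += 48;
    printf("low bits unchanged by the overlapping OR: %s\n",
           (hold & 0x1ffffffu) == before ? "yes" : "no");

    /* At exit, inflate_fast masks hold down to its accounted bits,
       restoring the (hold >> bits) == 0 invariant. */
    hold &= ((uint64_t)1 << bits) - 1;
    printf("invariant (hold >> bits) == 0 restored: %s\n",
           (hold >> bits) == 0 ? "yes" : "no");
    return 0;
}

Built with e.g. "cc -std=c99 refill.c", the sketch prints three decoded
codes, then confirms that the overlapping OR leaves the already-loaded low
bits untouched and that the final mask restores the hold/bits invariant.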