From: Sebastian Pop
Date: Wed, 7 Nov 2018 21:11:27 +0000 (-0600)
Subject: bug #117: speed up inflate_fast
X-Git-Tag: 1.9.9-b1~578
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dd70d6c467f26d727ea730ba577809fa0668940d;p=thirdparty%2Fzlib-ng.git

bug #117: speed up inflate_fast

Based on a patch by Nigel Tao:
https://github.com/madler/zlib/pull/292/commits/e0ff1f330cc03ee04843f857869b4036593ab39d

This patch makes unzipping of files up to 1.2x faster on x86_64. The other
part of Nigel Tao's patch (a further 1.3x speedup) is unsafe, as discussed
in the review of that pull request; zlib-ng already optimizes the memcpy
covered by that missing part in a different way.

The original patch was enabled only on little-endian machines. This patch
adapts the 64-bits-at-a-time load to big-endian machines as well.

Benchmarking notes from Hans Kristian Rosbach:
https://github.com/zlib-ng/zlib-ng/pull/224#issuecomment-444837182

Benchmark runs: 7, tested levels: 0-7, testfile 100M

develop at 796ad10 with -O3:
Level  Comp     Comptime min/avg/max  Decomptime min/avg/max
0      100.02%  0.01/0.01/0.02        0.08/0.09/0.11
1       47.08%  0.49/0.50/0.51        0.37/0.39/0.40
2       36.02%  1.10/1.12/1.13        0.39/0.39/0.40
3       34.77%  1.32/1.34/1.37        0.38/0.38/0.38
4       33.41%  1.50/1.53/1.56        0.37/0.37/0.38
5       33.07%  1.85/1.87/1.90        0.36/0.37/0.38
6       32.83%  2.54/2.57/2.61        0.36/0.37/0.38
avg     45.31%  1.28                  0.34
tot             62.60                 16.58

PR224 with -O3:
Level  Comp     Comptime min/avg/max  Decomptime min/avg/max
0      100.02%  0.01/0.01/0.02        0.09/0.09/0.10
1       47.08%  0.49/0.50/0.51        0.37/0.37/0.38
2       36.02%  1.09/1.11/1.13        0.38/0.38/0.39
3       34.77%  1.32/1.34/1.38        0.35/0.36/0.38
4       33.41%  1.49/1.52/1.54        0.36/0.36/0.37
5       33.07%  1.85/1.88/1.93        0.35/0.36/0.37
6       32.83%  2.55/2.58/2.65        0.35/0.35/0.36
avg     45.31%  1.28                  0.33
tot             62.48                 16.02

So I see about a 5.4% decompression speedup on my x86_64 machine: not quite
the 1.2x headline number, but a nice speedup nevertheless. This benchmark
measures the total execution time of minigzip, so process startup and I/O
overhead likely hide part of the gain. At -O2, I only see a 2.7% speedup.

(A standalone sketch of the 64-bit refill idiom, for experimentation,
follows the patch.)
---
diff --git a/infback.c b/infback.c
index 04553b72..b4fb562b 100644
--- a/infback.c
+++ b/infback.c
@@ -453,7 +453,8 @@ int ZEXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in_desc
 
         case LEN:
             /* use inflate_fast() if we have enough input and output */
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                 RESTORE();
                 if (state->whave < state->wsize)
                     state->whave = state->wsize - left;
diff --git a/inffast.c b/inffast.c
index 501a42d1..40e649a8 100644
--- a/inffast.c
+++ b/inffast.c
@@ -113,8 +113,8 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
    Entry assumptions:
 
         state->mode == LEN
-        strm->avail_in >= 6
-        strm->avail_out >= 258
+        strm->avail_in >= INFLATE_FAST_MIN_HAVE
+        strm->avail_out >= INFLATE_FAST_MIN_LEFT
         start >= strm->avail_out
         state->bits < 8
 
@@ -132,6 +132,10 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
       Therefore if strm->avail_in >= 6, then there is enough input to avoid
       checking for available input while decoding.
 
+    - On some architectures, it can be significantly faster (e.g. up to 1.2x
+      faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+      time, so INFLATE_FAST_MIN_HAVE == 8.
+
     - The maximum bytes that a single length/distance pair can output is 258
       bytes, which is the maximum length that can be coded.  inflate_fast()
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
 
@@ -155,7 +159,45 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     unsigned whave;             /* valid bytes in the window */
     unsigned wnext;             /* window write index */
     unsigned char *window;      /* allocated sliding window, if wsize != 0 */
-    uint32_t hold;              /* local strm->hold */
+
+    /* hold is a local copy of strm->hold. By default, hold satisfies the same
+       invariants that strm->hold does, namely that (hold >> bits) == 0. This
+       invariant is kept by loading bits into hold one byte at a time, like:
+
+       hold |= next_byte_of_input << bits; in++; bits += 8;
+
+       If we need to ensure that bits >= 15 then this code snippet is simply
+       repeated. Over one iteration of the outermost do/while loop, this
+       happens up to six times (48 bits of input), as described in the NOTES
+       above.
+
+       However, on some little endian architectures, it can be significantly
+       faster to load 64 bits once instead of 8 bits six times:
+
+       if (bits <= 16) {
+         hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+       }
+
+       Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+       by bits will overflow and lose those high bits, up to 2 bytes' worth.
+       The conservative estimate is therefore that we have read only 6 bytes
+       (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+       rest of the iteration, and we will not need to load another 8 bytes.
+
+       Inside this function, we no longer satisfy (hold >> bits) == 0, but
+       this is not problematic, even if that overflow does not land on an 8
+       bit byte boundary. Those excess bits will eventually shift down lower
+       as the Huffman decoder consumes input, and when new input bits need to
+       be loaded into the bits variable, the same input bits will be or'ed
+       over those existing bits. A bitwise or is idempotent: (a | b | b)
+       equals (a | b). Note that we therefore write that load operation as
+       "hold |= etc" and not "hold += etc".
+
+       Outside that loop, at the end of the function, hold is bitwise and'ed
+       with (1 << bits) - 1, so that, on function exit, the usual invariant
+       is restored: (state->hold >> state->bits) == 0.
+    */
+    uint64_t hold;              /* local strm->hold */
     unsigned bits;              /* local strm->bits */
     code const *lcode;          /* local strm->lencode */
     code const *dcode;          /* local strm->distcode */
@@ -171,10 +213,10 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     /* copy state to local variables */
     state = (struct inflate_state *)strm->state;
     in = strm->next_in;
-    last = in + (strm->avail_in - 5);
+    last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1));
     out = strm->next_out;
     beg = out - (start - strm->avail_out);
-    end = out + (strm->avail_out - 257);
+    end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
 
 #ifdef INFFAST_CHUNKSIZE
     safe = out + (strm->avail_out - INFFAST_CHUNKSIZE);
@@ -197,9 +239,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
        input data or output space */
     do {
         if (bits < 15) {
-            hold += load_short(in, bits);
-            in += 2;
-            bits += 16;
+            hold |= load_64_bits(in, bits);
+            in += 6;
+            bits += 48;
         }
         here = lcode[hold & lmask];
       dolen:
@@ -215,17 +257,18 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
             op &= 15;                       /* number of extra bits */
             if (op) {
                 if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                 }
                 len += BITS(op);
                 DROPBITS(op);
             }
             Tracevv((stderr, "inflate:         length %u\n", len));
             if (bits < 15) {
-                hold += load_short(in, bits);
-                in += 2;
-                bits += 16;
+                hold |= load_64_bits(in, bits);
+                in += 6;
+                bits += 48;
             }
             here = dcode[hold & dmask];
           dodist:
@@ -235,12 +278,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
                 dist = here.val;
                 op &= 15;                   /* number of extra bits */
                 if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
-                    if (bits < op) {
-                        hold += (uint32_t)(*in++) << bits;
-                        bits += 8;
-                    }
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                 }
                 dist += BITS(op);
 #ifdef INFLATE_STRICT
@@ -412,8 +452,12 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
     /* update state and return */
     strm->next_in = in;
     strm->next_out = out;
-    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
-    strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end));
+    strm->avail_in =
+        (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in)
+                             : (INFLATE_FAST_MIN_HAVE - 1) - (in - last));
+    strm->avail_out =
+        (unsigned)(out < end ? (INFLATE_FAST_MIN_LEFT - 1) + (end - out)
+                             : (INFLATE_FAST_MIN_LEFT - 1) - (out - end));
     state->hold = hold;
     state->bits = bits;
     return;
diff --git a/inffast.h b/inffast.h
index 464f6c8d..d2a9a72e 100644
--- a/inffast.h
+++ b/inffast.h
@@ -19,4 +19,7 @@ typedef uint8x16_t inffast_chunk_t;
 # define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
 #endif
 
+#define INFLATE_FAST_MIN_HAVE 8
+#define INFLATE_FAST_MIN_LEFT 258
+
 #endif /* INFFAST_H_ */
diff --git a/inflate.c b/inflate.c
index d2e621ea..ee0e5308 100644
--- a/inflate.c
+++ b/inflate.c
@@ -1016,7 +1016,8 @@ int ZEXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int flush) {
         case LEN_:
             state->mode = LEN;
         case LEN:
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                 RESTORE();
                 inflate_fast(strm, out);
                 LOAD();
diff --git a/memcopy.h b/memcopy.h
index 84d3c689..7f8697b1 100644
--- a/memcopy.h
+++ b/memcopy.h
@@ -4,23 +4,15 @@
 #ifndef MEMCOPY_H_
 #define MEMCOPY_H_
 
-/* Load a short from IN and place the bytes at offset BITS in the result.
- */
-static inline uint32_t load_short(const unsigned char *in, unsigned bits) {
-    union {
-        uint16_t a;
-        uint8_t b[2];
-    } chunk;
-    MEMCPY(&chunk, in, sizeof(uint16_t));
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+    uint64_t chunk;
+    MEMCPY(&chunk, in, sizeof(chunk));
 #if BYTE_ORDER == LITTLE_ENDIAN
-    uint32_t res = chunk.a;
-    return res << bits;
+    return chunk << bits;
 #else
-    uint32_t c0 = chunk.b[0];
-    uint32_t c1 = chunk.b[1];
-    c0 <<= bits;
-    c1 <<= bits + 8;
-    return c0 + c1;
+    return ZSWAP64(chunk) << bits;
 #endif
 }
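
For experimentation outside of zlib-ng, here is a minimal, self-contained
C99 sketch of the refill idiom described by the new inffast.c comment
above; it is not part of the patch. The names load64(), input[], and the
13-bit "codes" are invented for this illustration: load64() is a portable,
byte-by-byte stand-in for load_64_bits() that sidesteps MEMCPY, BYTE_ORDER
and ZSWAP64, so it behaves identically on little- and big-endian machines.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for load_64_bits(): read 8 bytes from `in` as a little-endian
   64-bit value and pre-shift it past the `bits` already held.  Up to 2
   bytes' worth of the chunk can fall off the top of the 64-bit result,
   which is why callers only advance `in` by 6. */
static uint64_t load64(const unsigned char *in, unsigned bits) {
    uint64_t chunk = 0;
    for (int i = 0; i < 8; i++)
        chunk |= (uint64_t)in[i] << (8 * i);
    return chunk << bits;
}

int main(void) {
    /* Arbitrary stand-in for compressed input; a real inflate consumes
       these bytes least-significant-bit first. */
    const unsigned char input[16] = {
        0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
        0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88
    };
    const unsigned char *in = input;
    uint64_t hold = 0;  /* bit accumulator, like inflate_fast's hold */
    unsigned bits = 0;  /* bits officially accounted for in hold */

    /* First refill, exactly as the patched loop does: OR in 8 bytes but
       account for only 6 of them (48 bits). */
    if (bits < 15) {
        hold |= load64(in, bits);
        in += 6;
        bits += 48;
    }

    /* Consume 13 bits three times, as length/distance decodes would;
       bits drops from 48 to 9, below the 15-bit refill threshold. */
    for (int step = 0; step < 3; step++) {
        printf("decoded 13-bit code 0x%04x\n", (unsigned)(hold & 0x1fff));
        hold >>= 13;
        bits -= 13;
    }

    /* Second refill.  Its low 16 bits duplicate input bits that already
       sit in hold (the 2 "extra" bytes of the first load), and OR-ing
       identical bits back in is a no-op: (a | b | b) == (a | b).  This
       is why the load uses |= rather than +=. */
    uint64_t before = hold;
    hold |= load64(in, bits);
    in += 6;
    bits += 48;
    printf("low bits unchanged by the overlapping OR: %s\n",
           (hold & 0x1ffffffu) == before ? "yes" : "no");

    /* At exit, inflate_fast masks hold down to its accounted bits,
       restoring the (hold >> bits) == 0 invariant. */
    hold &= ((uint64_t)1 << bits) - 1;
    printf("invariant (hold >> bits) == 0 restored: %s\n",
           (hold >> bits) == 0 ? "yes" : "no");
    return 0;
}

Built with e.g. "cc -std=c99 refill.c", the sketch prints three decoded
codes, then confirms that the overlapping OR leaves the already-loaded low
bits untouched and that the final mask restores the hold/bits invariant.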