bug #117: speed up inflate_fast

author Sebastian Pop <s.pop@samsung.com>

Wed, 7 Nov 2018 21:11:27 +0000 (15:11 -0600)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Tue, 18 Dec 2018 12:40:06 +0000 (13:40 +0100)
author Sebastian Pop <s.pop@samsung.com>
Wed, 7 Nov 2018 21:11:27 +0000 (15:11 -0600)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 18 Dec 2018 12:40:06 +0000 (13:40 +0100)
diff --git a/infback.c b/infback.c

index 04553b72aed2a5001c241dfa1db0f53846110167..b4fb562b1e6884a66b878ec67e2b6c3020360e6f 100644 (file)
--- a/infback.c
+++ b/infback.c
@@ -453,7 +453,8 @@ int ZEXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in_desc
  
          case LEN:
              /* use inflate_fast() if we have enough input and output */
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                  RESTORE();
                  if (state->whave < state->wsize)
                      state->whave = state->wsize - left;
diff --git a/inffast.c b/inffast.c

index 501a42d1f5ae1d1f79fd61af2bda5a269c7911c4..40e649a8b4fdf6d33f41f3a520a136bb8dffdb6d 100644 (file)
--- a/inffast.c
+++ b/inffast.c
@@ -113,8 +113,8 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
     Entry assumptions:
  
          state->mode == LEN
-        strm->avail_in >= 6
-        strm->avail_out >= 258
+        strm->avail_in >= INFLATE_FAST_MIN_HAVE
+        strm->avail_out >= INFLATE_FAST_MIN_LEFT
          start >= strm->avail_out
          state->bits < 8
  
@@ -132,6 +132,10 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns
        Therefore if strm->avail_in >= 6, then there is enough input to avoid
        checking for available input while decoding.
  
+    - On some architectures, it can be significantly faster (e.g. up to 1.2x
+      faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+      time, so INFLATE_FAST_MIN_HAVE == 8.
+
      - The maximum bytes that a single length/distance pair can output is 258
        bytes, which is the maximum length that can be coded.  inflate_fast()
        requires strm->avail_out >= 258 for each loop to avoid checking for
@@ -155,7 +159,45 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
      unsigned whave;             /* valid bytes in the window */
      unsigned wnext;             /* window write index */
      unsigned char *window;      /* allocated sliding window, if wsize != 0 */
-    uint32_t hold;              /* local strm->hold */
+
+    /* hold is a local copy of strm->hold. By default, hold satisfies the same
+       invariants that strm->hold does, namely that (hold >> bits) == 0. This
+       invariant is kept by loading bits into hold one byte at a time, like:
+
+       hold |= next_byte_of_input << bits; in++; bits += 8;
+
+       If we need to ensure that bits >= 15 then this code snippet is simply
+       repeated. Over one iteration of the outermost do/while loop, this
+       happens up to six times (48 bits of input), as described in the NOTES
+       above.
+
+       However, on some little endian architectures, it can be significantly
+       faster to load 64 bits once instead of 8 bits six times:
+
+       if (bits <= 16) {
+         hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+       }
+
+       Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+       by bits will overflow and lose those high bits, up to 2 bytes' worth.
+       The conservative estimate is therefore that we have read only 6 bytes
+       (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+       rest of the iteration, and we will not need to load another 8 bytes.
+
+       Inside this function, we no longer satisfy (hold >> bits) == 0, but
+       this is not problematic, even if that overflow does not land on an 8 bit
+       byte boundary. Those excess bits will eventually shift down lower as the
+       Huffman decoder consumes input, and when new input bits need to be loaded
+       into the bits variable, the same input bits will be or'ed over those
+       existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+       Note that we therefore write that load operation as "hold |= etc" and not
+       "hold += etc".
+
+       Outside that loop, at the end of the function, hold is bitwise and'ed
+       with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+       keep the invariant that (state->hold >> state->bits) == 0.
+    */
+    uint64_t hold;              /* local strm->hold */
      unsigned bits;              /* local strm->bits */
      code const *lcode;          /* local strm->lencode */
      code const *dcode;          /* local strm->distcode */
@@ -171,10 +213,10 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
      /* copy state to local variables */
      state = (struct inflate_state *)strm->state;
      in = strm->next_in;
-    last = in + (strm->avail_in - 5);
+    last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1));
      out = strm->next_out;
      beg = out - (start - strm->avail_out);
-    end = out + (strm->avail_out - 257);
+    end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
  
  #ifdef INFFAST_CHUNKSIZE
      safe = out + (strm->avail_out - INFFAST_CHUNKSIZE);
@@ -197,9 +239,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
         input data or output space */
      do {
          if (bits < 15) {
-            hold += load_short(in, bits);
-            in += 2;
-            bits += 16;
+            hold |= load_64_bits(in, bits);
+            in += 6;
+            bits += 48;
          }
          here = lcode[hold & lmask];
        dolen:
@@ -215,17 +257,18 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
              op &= 15;                           /* number of extra bits */
              if (op) {
                  if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                  }
                  len += BITS(op);
                  DROPBITS(op);
              }
              Tracevv((stderr, "inflate:         length %u\n", len));
              if (bits < 15) {
-                hold += load_short(in, bits);
-                in += 2;
-                bits += 16;
+                hold |= load_64_bits(in, bits);
+                in += 6;
+                bits += 48;
              }
              here = dcode[hold & dmask];
            dodist:
@@ -235,12 +278,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
                  dist = here.val;
                  op &= 15;                       /* number of extra bits */
                  if (bits < op) {
-                    hold += (uint32_t)(*in++) << bits;
-                    bits += 8;
-                    if (bits < op) {
-                        hold += (uint32_t)(*in++) << bits;
-                        bits += 8;
-                    }
+                    hold |= load_64_bits(in, bits);
+                    in += 6;
+                    bits += 48;
                  }
                  dist += BITS(op);
  #ifdef INFLATE_STRICT
@@ -412,8 +452,12 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) {
      /* update state and return */
      strm->next_in = in;
      strm->next_out = out;
-    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
-    strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end));
+    strm->avail_in =
+        (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in)
+                             : (INFLATE_FAST_MIN_HAVE - 1) - (in - last));
+    strm->avail_out =
+        (unsigned)(out < end ? (INFLATE_FAST_MIN_LEFT - 1) + (end - out)
+                             : (INFLATE_FAST_MIN_LEFT - 1) - (out - end));
      state->hold = hold;
      state->bits = bits;
      return;
diff --git a/inffast.h b/inffast.h

index 464f6c8df888cd5e7788b63338b0ebfee7bb2bf7..d2a9a72e8bb66d7a8231974741b6b51c50d2cd50 100644 (file)
--- a/inffast.h
+++ b/inffast.h
@@ -19,4 +19,7 @@ typedef uint8x16_t inffast_chunk_t;
  #  define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
  #endif
  
+#define INFLATE_FAST_MIN_HAVE 8
+#define INFLATE_FAST_MIN_LEFT 258
+
  #endif /* INFFAST_H_ */
diff --git a/inflate.c b/inflate.c

index d2e621eaa67ec0f8766ceabcfe04770335baaeea..ee0e5308c549a865d54b1b2b02dcd977743ea873 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -1016,7 +1016,8 @@ int ZEXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int flush) {
          case LEN_:
              state->mode = LEN;
          case LEN:
-            if (have >= 6 && left >= 258) {
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
                  RESTORE();
                  inflate_fast(strm, out);
                  LOAD();
diff --git a/memcopy.h b/memcopy.h

index 84d3c689d355f373662389ded9634b44b9d1be1c..7f8697b1acd469905374c2a3e4879d2e3609cec2 100644 (file)
--- a/memcopy.h
+++ b/memcopy.h
@@ -4,23 +4,15 @@
  #ifndef MEMCOPY_H_
  #define MEMCOPY_H_
  
-/* Load a short from IN and place the bytes at offset BITS in the result. */
-static inline uint32_t load_short(const unsigned char *in, unsigned bits) {
-    union {
-        uint16_t a;
-        uint8_t b[2];
-    } chunk;
-    MEMCPY(&chunk, in, sizeof(uint16_t));
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+  uint64_t chunk;
+  MEMCPY(&chunk, in, sizeof(chunk));
  
  #if BYTE_ORDER == LITTLE_ENDIAN
-    uint32_t res = chunk.a;
-    return res << bits;
+  return chunk << bits;
  #else
-    uint32_t c0 = chunk.b[0];
-    uint32_t c1 = chunk.b[1];
-    c0 <<= bits;
-    c1 <<= bits + 8;
-    return c0 + c1;
+  return ZSWAP64(chunk) << bits;
  #endif
  }
author	Sebastian Pop <s.pop@samsung.com>
	Wed, 7 Nov 2018 21:11:27 +0000 (15:11 -0600)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Tue, 18 Dec 2018 12:40:06 +0000 (13:40 +0100)
infback.c		patch \| blob \| blame \| history
inffast.c		patch \| blob \| blame \| history
inffast.h		patch \| blob \| blame \| history
inflate.c		patch \| blob \| blame \| history
memcopy.h		patch \| blob \| blame \| history