]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Inflate using wider loads and stores and a minimum of branches. (#95)
authorSimon Hosie <simhos01@users.noreply.github.com>
Wed, 22 Mar 2017 17:48:39 +0000 (10:48 -0700)
committerHans Kristian Rosbach <hk-git@circlestorm.org>
Fri, 24 Mar 2017 21:01:48 +0000 (22:01 +0100)
* Inflate using wider loads and stores.

In inflate_fast() the output pointer always has plenty of room to write.  This
means that so long as the target is capable, wide un-aligned loads and stores
can be used to transfer several bytes at once.

When the reference distance is too short simply unroll the data a little to
increase the distance.

Change-Id: I59854eb25d2b1e43561c8a2afaf9175bf10cf674

inffast.c
inffast.h
inflate.c

index 1d4048defa78565afbfcd261287e596f5e19d202..1e60b2392d23dd1d68beabb6d380746b2b485a18 100644 (file)
--- a/inffast.c
+++ b/inffast.c
         bits -= (unsigned)(n); \
     } while (0)
 
+#ifdef INFFAST_CHUNKSIZE
+/*
+   Ask the compiler to perform a wide, unaligned load with an machine
+   instruction appropriate for the inffast_chunk_t type.
+ */
+static inline inffast_chunk_t loadchunk(unsigned char const* s) {
+    inffast_chunk_t c;
+    __builtin_memcpy(&c, s, sizeof(c));
+    return c;
+}
+
+/*
+   Ask the compiler to perform a wide, unaligned store with an machine
+   instruction appropriate for the inffast_chunk_t type.
+ */
+static inline void storechunk(unsigned char* d, inffast_chunk_t c) {
+    __builtin_memcpy(d, &c, sizeof(c));
+}
+
+/*
+   Behave like memcpy, but assume that it's OK to overwrite at least
+   INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this,
+   that the length is non-zero, and that `from` lags `out` by at least
+   INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that
+   the distance is less than the length of the copy).
+
+   Aside from better memory bus utilisation, this means that short copies
+   (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable.
+ */
+static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) {
+    --len;
+    storechunk(out, loadchunk(from));
+    out += (len % INFFAST_CHUNKSIZE) + 1;
+    from += (len % INFFAST_CHUNKSIZE) + 1;
+    len /= INFFAST_CHUNKSIZE;
+    while (len-- > 0) {
+        storechunk(out, loadchunk(from));
+        out += INFFAST_CHUNKSIZE;
+        from += INFFAST_CHUNKSIZE;
+    }
+    return out;
+}
+
+/*
+   Behave like chunkcopy, but avoid writing beyond of legal output.
+ */
+static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len,
+                                           unsigned char *safe) {
+    if (out > safe) {
+        while (len-- > 0) {
+          *out++ = *from++;
+        }
+        return out;
+    }
+    return chunkcopy(out, from, len);
+}
+
+/*
+   Perform short copies until distance can be rewritten as being at least
+   INFFAST_CHUNKSIZE.
+
+   This assumes that it's OK to overwrite at least the first
+   2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this.
+   This assumption holds because inflate_fast() starts every iteration with at
+   least 258 bytes of output space available (258 being the maximum length
+   output from a single token; see inflate_fast()'s assumptions below).
+ */
+static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) {
+    unsigned char const *from = out - *dist;
+    while (*dist < *len && *dist < INFFAST_CHUNKSIZE) {
+        storechunk(out, loadchunk(from));
+        out += *dist;
+        *len -= *dist;
+        *dist += *dist;
+    }
+    return out;
+}
+#endif
+
 /*
    Decode literal, length, and distance codes and write out the resulting
    literal and match bytes until either not enough input or output is
@@ -62,6 +143,9 @@ void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start) {
     unsigned char *out;         /* local strm->next_out */
     unsigned char *beg;         /* inflate()'s initial strm->next_out */
     unsigned char *end;         /* while out < end, enough space available */
+#ifdef INFFAST_CHUNKSIZE
+    unsigned char *safe;        /* can use chunkcopy provided out < safe */
+#endif
 #ifdef INFLATE_STRICT
     unsigned dmax;              /* maximum distance from zlib header */
 #endif
@@ -89,6 +173,10 @@ void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start) {
     out = strm->next_out;
     beg = out - (start - strm->avail_out);
     end = out + (strm->avail_out - 257);
+
+#ifdef INFFAST_CHUNKSIZE
+    safe = out + (strm->avail_out - INFFAST_CHUNKSIZE);
+#endif
 #ifdef INFLATE_STRICT
     dmax = state->dmax;
 #endif
@@ -193,6 +281,34 @@ void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start) {
                         }
 #endif
                     }
+#ifdef INFFAST_CHUNKSIZE
+                    from = window;
+                    if (wnext == 0) {           /* very common case */
+                        from += wsize - op;
+                    } else if (wnext >= op) {   /* contiguous in window */
+                        from += wnext - op;
+                    } else {                    /* wrap around window */
+                        op -= wnext;
+                        from += wsize - op;
+                        if (op < len) {         /* some from end of window */
+                            len -= op;
+                            out = chunkcopysafe(out, from, op, safe);
+                            from = window;      /* more from start of window */
+                            op = wnext;
+                            /* This (rare) case can create a situation where
+                               the first chunkcopy below must be checked.
+                             */
+                        }
+                    }
+                    if (op < len) {             /* still need some from output */
+                        len -= op;
+                        out = chunkcopysafe(out, from, op, safe);
+                        out = chunkunroll(out, &dist, &len);
+                        out = chunkcopysafe(out, out - dist, len, safe);
+                    } else {
+                        out = chunkcopysafe(out, from, len, safe);
+                    }
+#else
                     from = window;
                     if (wnext == 0) {           /* very common case */
                         from += wsize - op;
@@ -242,7 +358,18 @@ void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start) {
                         if (len > 1)
                             *out++ = *from++;
                     }
+#endif
                 } else {
+#ifdef INFFAST_CHUNKSIZE
+                    /* Whole reference is in range of current output.  No
+                       range checks are necessary because we start with room
+                       for at least 258 bytes of output, so unroll and roundoff
+                       operations can write beyond `out+len` so long as they
+                       stay within 258 bytes of `out`.
+                     */
+                    out = chunkunroll(out, &dist, &len);
+                    out = chunkcopy(out, out - dist, len);
+#else
                     from = out - dist;          /* copy direct from output */
                     if (dist == 1) {
                         memset (out, *from, len);
@@ -260,6 +387,7 @@ void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start) {
                                 *out++ = *from++;
                         }
                     }
+#endif
                 }
             } else if ((op & 64) == 0) {          /* 2nd level distance code */
                 here = dcode[here.val + BITS(op)];
index 0d75614772a752050785ed9785ebded95a53d95a..fdf5a187b8375a01e0eac242e1eda7d62af3ba1d 100644 (file)
--- a/inffast.h
+++ b/inffast.h
 
 void ZLIB_INTERNAL inflate_fast(z_stream *strm, unsigned long start);
 
+
+#if (defined(__GNUC__) || defined(__clang__)) && defined(__ARM_NEON__)
+#  include <arm_neon.h>
+typedef uint8x16_t inffast_chunk_t;
+#  define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
+#endif
+
 #endif /* INFFAST_H_ */
index 84ae6e72e47962e77e7e6f36c3e184323ed8f07a..8484bc326e9e9972685f5404182c96f0e8d98e0e 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -371,9 +371,17 @@ static int updatewindow(z_stream *strm, const unsigned char *end, uint32_t copy)
 
     /* if it hasn't been done already, allocate space for the window */
     if (state->window == NULL) {
+#ifdef INFFAST_CHUNKSIZE
+        unsigned wsize = 1U << state->wbits;
+        state->window = (unsigned char *) ZALLOC(strm, wsize + INFFAST_CHUNKSIZE, sizeof(unsigned char));
+        if (state->window == Z_NULL)
+            return 1;
+        memset(state->window + wsize, 0, INFFAST_CHUNKSIZE);
+#else
         state->window = (unsigned char *) ZALLOC(strm, 1U << state->wbits, sizeof(unsigned char));
         if (state->window == NULL)
             return 1;
+#endif
     }
 
     /* if window not in use yet, initialize */