Speed up chunkcopy and memset

author Adam Stylinski <kungfujesus06@gmail.com>

Mon, 21 Feb 2022 21:52:17 +0000 (16:52 -0500)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Wed, 16 Mar 2022 10:42:19 +0000 (11:42 +0100)
author Adam Stylinski <kungfujesus06@gmail.com>
Mon, 21 Feb 2022 21:52:17 +0000 (16:52 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Wed, 16 Mar 2022 10:42:19 +0000 (11:42 +0100)
diff --git a/chunkset_tpl.h b/chunkset_tpl.h

index 8e6f566443c9861cd33fad09ae93dc7bb875ca19..189b5ec4576c8fd2ec3834c586d6f7f6f5f29f52 100644 (file)
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -3,6 +3,7 @@
   */
  
  #include "zbuild.h"
+#include <stdlib.h>
  
  /* Returns the chunk size */
  Z_INTERNAL uint32_t CHUNKSIZE(void) {
@@ -38,52 +39,6 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
      return out;
  }
  
-/* Behave like chunkcopy, but avoid writing beyond of legal output. */
-Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
-    unsigned safelen = (unsigned)((safe - out) + 1);
-    len = MIN(len, safelen);
-#if CHUNK_SIZE >= 32
-    while (len >= 32) {
-        memcpy(out, from, 32);
-        out += 32;
-        from += 32;
-        len -= 32;
-    }
-#endif
-#if CHUNK_SIZE >= 16
-    while (len >= 16) {
-        memcpy(out, from, 16);
-        out += 16;
-        from += 16;
-        len -= 16;
-    }
-#endif
-#if CHUNK_SIZE >= 8
-    while (len >= 8) {
-        zmemcpy_8(out, from);
-        out += 8;
-        from += 8;
-        len -= 8;
-    }
-#endif
-    if (len >= 4) {
-        zmemcpy_4(out, from);
-        out += 4;
-        from += 4;
-        len -= 4;
-    }
-    if (len >= 2) {
-        zmemcpy_2(out, from);
-        out += 2;
-        from += 2;
-        len -= 2;
-    }
-    if (len == 1) {
-        *out++ = *from++;
-    }
-    return out;
-}
-
  /* Perform short copies until distance can be rewritten as being at least
     sizeof chunk_t.
  
@@ -112,66 +67,80 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
         Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
      Assert(dist > 0, "chunkmemset cannot have a distance 0");
  
-    unsigned char *from = out - dist;
-    chunk_t chunk;
-    unsigned sz = sizeof(chunk);
-    if (len < sz) {
-        while (len != 0) {
-            *out++ = *from++;
-            --len;
-        }
-        return out;
-    }
+    uint8_t *from = out - dist;
  
-#ifdef HAVE_CHUNKMEMSET_1
      if (dist == 1) {
-        chunkmemset_1(from, &chunk);
-    } else
-#endif
+        memset(out, *from, len);
+        return out + len;
+    } else if (dist > sizeof(chunk_t)) {
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    chunk_t chunk_load;
+    uint32_t chunk_mod = 0;
+
+    /* TODO: possibly build up a permutation table for this if dist does not evenly divide the chunk size */
  #ifdef HAVE_CHUNKMEMSET_2
      if (dist == 2) {
-        chunkmemset_2(from, &chunk);
+        chunkmemset_2(from, &chunk_load);
      } else
  #endif
  #ifdef HAVE_CHUNKMEMSET_4
      if (dist == 4) {
-        chunkmemset_4(from, &chunk);
+        chunkmemset_4(from, &chunk_load);
      } else
  #endif
  #ifdef HAVE_CHUNKMEMSET_8
      if (dist == 8) {
-        chunkmemset_8(from, &chunk);
+        chunkmemset_8(from, &chunk_load);
+    } else if (dist == sizeof(chunk_t)) {
+        loadchunk(from, &chunk_load);
      } else
  #endif
-    if (dist == sz) {
-        loadchunk(from, &chunk);
-    } else if (dist < sz) {
-        unsigned char *end = out + len - 1;
-        while (len > dist) {
-            out = CHUNKCOPY_SAFE(out, from, dist, end);
-            len -= dist;
+    {
+        /* This code takes string of length dist from "from" and repeats
+         * it for as many times as can fit in a chunk_t (vector register) */
+        int32_t cpy_dist;
+        int32_t bytes_remaining = sizeof(chunk_t);
+        uint8_t *cur_chunk = (uint8_t*)&chunk_load;
+        while (bytes_remaining) {
+            cpy_dist = MIN(dist, bytes_remaining);
+            memcpy(cur_chunk, from, cpy_dist);
+            bytes_remaining -= cpy_dist;
+            cur_chunk += cpy_dist;
+            /* This allows us to bypass an expensive integer division since we're effectively
+             * counting in this loop, anyway. However, we may have to derive a similarly
+             * sensible solution for if we use a permutation table that allows us to construct
+             * this vector in one load and one permute instruction */
+            chunk_mod = cpy_dist;
          }
-        if (len > 0) {
-            out = CHUNKCOPY_SAFE(out, from, len, end);
+    }
+
+    /* If we're lucky and dist happens to divide our vector length evenly,
+     * we can do two stores per loop iteration, which is beneficial for most ISAs, especially x86 */
+    if (chunk_mod == 0) {
+        while (len >= (2 * sizeof(chunk_t))) {
+            storechunk(out, &chunk_load);
+            storechunk(out + sizeof(chunk_t), &chunk_load);
+            out += 2 * sizeof(chunk_t);
+            len -= 2 * sizeof(chunk_t);
          }
-        return out;
-    } else {
-        out = CHUNKUNROLL(out, &dist, &len);
-        return CHUNKCOPY(out, out - dist, len);
      }
  
-    unsigned rem = len % sz;
-    len -= rem;
-    while (len) {
-        storechunk(out, &chunk);
-        out += sz;
-        len -= sz;
+    /* If we don't have a "dist" length that divides evenly into a vector
+     * register, we can write the whole vector register but we need only
+     * advance by the amount of the whole string that fits in our chunk_t.
+     * If we do divide evenly into the vector length, adv_amount = sizeof(chunk_t) */
+    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
+    while (len >= sizeof(chunk_t)) {
+        storechunk(out, &chunk_load);
+        len -= adv_amount;
+        out += adv_amount;
      }
  
-    /* Last, deal with the case when LEN is not a multiple of SZ. */
-    if (rem) {
-        memcpy(out, from, rem);
-        out += rem;
+    if (len) {
+        memcpy(out, &chunk_load, len);
+        out += len;
      }
  
      return out;
diff --git a/functable.c b/functable.c

index 5147d3f8d953087c11e1bbfa705459db56f81b8f..f39db2955416a0de1ed2f36b4fb9beafd7fca41f 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -273,32 +273,6 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
      return functable.chunkcopy(out, from, len);
  }
  
-Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
-    // Initialize default
-    functable.chunkcopy_safe = &chunkcopy_safe_c;
-
-#ifdef X86_SSE2_CHUNKSET
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
-    if (x86_cpu_has_sse2)
-# endif
-        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
-#endif
-#ifdef X86_AVX_CHUNKSET
-    if (x86_cpu_has_avx2)
-        functable.chunkcopy_safe = &chunkcopy_safe_avx;
-#endif
-#ifdef ARM_NEON_CHUNKSET
-    if (arm_cpu_has_neon)
-        functable.chunkcopy_safe = &chunkcopy_safe_neon;
-#endif
-#ifdef POWER8_VSX_CHUNKSET
-    if (power_cpu_has_arch_2_07)
-        functable.chunkcopy_safe = &chunkcopy_safe_power8;
-#endif
-
-    return functable.chunkcopy_safe(out, from, len, safe);
-}
-
  Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
      // Initialize default
      functable.chunkunroll = &chunkunroll_c;
@@ -436,7 +410,6 @@ Z_INTERNAL Z_TLS struct functable_s functable = {
      compare256_stub,
      chunksize_stub,
      chunkcopy_stub,
-    chunkcopy_safe_stub,
      chunkunroll_stub,
      chunkmemset_stub,
      chunkmemset_safe_stub,
diff --git a/functable.h b/functable.h

index 949c5b1be85923ba3552b062459a17321042932d..a106c93aab9035fdf0ab8d6afc58976e73b0a211 100644 (file)
--- a/functable.h
+++ b/functable.h
@@ -18,7 +18,6 @@ struct functable_s {
      uint32_t (* compare256)         (const uint8_t *src0, const uint8_t *src1);
      uint32_t (* chunksize)          (void);
      uint8_t* (* chunkcopy)          (uint8_t *out, uint8_t const *from, unsigned len);
-    uint8_t* (* chunkcopy_safe)     (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
      uint8_t* (* chunkunroll)        (uint8_t *out, unsigned *dist, unsigned *len);
      uint8_t* (* chunkmemset)        (uint8_t *out, unsigned dist, unsigned len);
      uint8_t* (* chunkmemset_safe)   (uint8_t *out, unsigned dist, unsigned len, unsigned left);
diff --git a/inffast.c b/inffast.c

index 58cbad7ef915ce5b9cfe85f8677000f3a501caef..e0fb99c593241b6b9ed752da9b45235c6093706b 100644 (file)
--- a/inffast.c
+++ b/inffast.c
@@ -11,7 +11,6 @@
  #include "inflate_p.h"
  #include "functable.h"
  
-
  /* Load 64 bits from IN and place the bytes at offset BITS in the result. */
  static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
      uint64_t chunk;
@@ -430,7 +429,7 @@ void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start
                          from += wsize - op;
                          if (op < len) {         /* some from end of window */
                              len -= op;
-                            out = functable.chunkcopy_safe(out, from, op, safe);
+                            out = chunkcopy_safe(out, from, op, safe);
                              from = window;      /* more from start of window */
                              op = wnext;
                              /* This (rare) case can create a situation where
@@ -440,16 +439,16 @@ void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start
                      }
                      if (op < len) {             /* still need some from output */
                          len -= op;
-                        out = functable.chunkcopy_safe(out, from, op, safe);
+                        out = chunkcopy_safe(out, from, op, safe);
                          out = functable.chunkunroll(out, &dist, &len);
-                        out = functable.chunkcopy_safe(out, out - dist, len, safe);
+                        out = chunkcopy_safe(out, out - dist, len, safe);
                      } else {
-                        out = functable.chunkcopy_safe(out, from, len, safe);
+                        out = chunkcopy_safe(out, from, len, safe);
                      }
                  } else if (extra_safe) {
                      /* Whole reference is in range of current output. */
                      if (dist >= len || dist >= state->chunksize)
-                        out = functable.chunkcopy_safe(out, out - dist, len, safe);
+                        out = chunkcopy_safe(out, out - dist, len, safe);
                      else
                          out = functable.chunkmemset_safe(out, dist, len, (unsigned)((safe - out) + 1));
                  } else {
diff --git a/inflate.c b/inflate.c

index 06b37bf40c98e57b3b3017a7b13fd7711c1016f3..5e2af29c6cb9e97273908daa2573e4ff8d080b87 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -887,7 +887,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
              }
              unsigned char *next_out = state->window + state->wsize + state->wnext;
              if (copy <= state->offset) {
-                functable.chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
+                chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
              } else {                             /* copy from output */
                  functable.chunkmemset_safe(next_out, state->offset, copy, (uint32_t)buf_left);
              }
diff --git a/inflate_p.h b/inflate_p.h

index c5ba13a0c601852f559c2489c0a88dcc64aebb50..86e3702b864fa1962035e78bb2d3a8461aa8c592 100644 (file)
--- a/inflate_p.h
+++ b/inflate_p.h
@@ -211,3 +211,78 @@ static inline void window_output_flush(PREFIX3(stream) *strm) {
      state->whave += out_bytes;
      state->whave = MIN(state->whave, state->wsize);
  }
+
+/* Behave like chunkcopy, but avoid writing beyond the legal output. */
+static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe) {
+    uint32_t safelen = (uint32_t)((safe - out) + 1);
+    len = MIN(len, safelen);
+    int olap_src = from >= out && from < out + len;
+    int olap_dst = out >= from && out < from + len;
+    int tocopy;
+
+    /* For all cases without overlap, memcpy is ideal */
+    if (!(olap_src || olap_dst)) {
+        memcpy(out, from, len);
+        return out + len;
+    }
+
+    /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior,
+     * we have to get a bit clever. First, if the overlap is such that src falls between dst and dst+len, we can do the
+     * initial bulk memcpy of the non-overlapping region. Then, we can leverage the size of this to determine the safest
+     * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe
+     * lookbehind or lookahead distance */
+    int non_olap_size = (from > out) ? from - out : out - from;
+
+    memcpy(out, from, non_olap_size);
+    out += non_olap_size;
+    from += non_olap_size;
+    len -= non_olap_size;
+
+    /* So this doesn't give us a worst-case scenario of function calls in a loop,
+     * we want to instead break this down into copy blocks of fixed lengths */
+    while (len) {
+        tocopy = MIN(non_olap_size, len);
+        len -= tocopy;
+
+        while (tocopy >= 32) {
+            memcpy(out, from, 32);
+            out += 32;
+            from += 32;
+            tocopy -= 32;
+        }
+
+        if (tocopy >= 16) {
+            memcpy(out, from, 16);
+            out += 16;
+            from += 16;
+            tocopy -= 16;
+        }
+
+        if (tocopy >= 8) {
+            zmemcpy_8(out, from);
+            out += 8;
+            from += 8;
+            tocopy -= 8;
+        }
+
+        if (tocopy >= 4) {
+            zmemcpy_4(out, from);
+            out += 4;
+            from += 4;
+            tocopy -= 4;
+        }
+
+        if (tocopy >= 2) {
+            zmemcpy_2(out, from);
+            out += 2;
+            from += 2;
+            tocopy -= 2;
+        }
+
+        if (tocopy) {
+            *out++ = *from++;
+        }
+    }
+
+    return out;
+}
author	Adam Stylinski <kungfujesus06@gmail.com>
	Mon, 21 Feb 2022 21:52:17 +0000 (16:52 -0500)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Wed, 16 Mar 2022 10:42:19 +0000 (11:42 +0100)
chunkset_tpl.h		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history
functable.h		patch \| blob \| blame \| history
inffast.c		patch \| blob \| blame \| history
inflate.c		patch \| blob \| blame \| history
inflate_p.h		patch \| blob \| blame \| history