From: Adam Stylinski
Date: Mon, 21 Feb 2022 21:52:17 +0000 (-0500)
Subject: Speed up chunkcopy and memset
X-Git-Tag: 2.1.0-beta1~345
X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=49a6bb5d413fef477a76111e1a5946452ff9d4a8;p=thirdparty%2Fzlib-ng.git

Speed up chunkcopy and memset

This was found to have a significant impact on a highly compressible PNG,
for both encode and decode. Some deltas show performance improving by as
much as 60%.

For the scenarios where "dist" does not divide evenly into our chunk size,
we simply repeat the bytes as many times as they will fit into our vector
register. We then store the entire vector and advance the output pointer
by the largest multiple of dist that fits within the chunk size (see the
standalone sketch after the patch).

If dist happens to be 1, there is no reason not to just call memset from
libc (it is likely to be just as fast, if not faster).
---

diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index 8e6f5664..189b5ec4 100644
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -3,6 +3,7 @@
  */
 
 #include "zbuild.h"
+#include
 
 /* Returns the chunk size */
 Z_INTERNAL uint32_t CHUNKSIZE(void) {
@@ -38,52 +39,6 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
     return out;
 }
 
-/* Behave like chunkcopy, but avoid writing beyond of legal output. */
-Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
-    unsigned safelen = (unsigned)((safe - out) + 1);
-    len = MIN(len, safelen);
-#if CHUNK_SIZE >= 32
-    while (len >= 32) {
-        memcpy(out, from, 32);
-        out += 32;
-        from += 32;
-        len -= 32;
-    }
-#endif
-#if CHUNK_SIZE >= 16
-    while (len >= 16) {
-        memcpy(out, from, 16);
-        out += 16;
-        from += 16;
-        len -= 16;
-    }
-#endif
-#if CHUNK_SIZE >= 8
-    while (len >= 8) {
-        zmemcpy_8(out, from);
-        out += 8;
-        from += 8;
-        len -= 8;
-    }
-#endif
-    if (len >= 4) {
-        zmemcpy_4(out, from);
-        out += 4;
-        from += 4;
-        len -= 4;
-    }
-    if (len >= 2) {
-        zmemcpy_2(out, from);
-        out += 2;
-        from += 2;
-        len -= 2;
-    }
-    if (len == 1) {
-        *out++ = *from++;
-    }
-    return out;
-}
-
 /* Perform short copies until distance can be rewritten as being at least
    sizeof chunk_t.
@@ -112,66 +67,80 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
        Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
     Assert(dist > 0, "chunkmemset cannot have a distance 0");
 
-    unsigned char *from = out - dist;
-    chunk_t chunk;
-    unsigned sz = sizeof(chunk);
-    if (len < sz) {
-        while (len != 0) {
-            *out++ = *from++;
-            --len;
-        }
-        return out;
-    }
+    uint8_t *from = out - dist;
 
-#ifdef HAVE_CHUNKMEMSET_1
     if (dist == 1) {
-        chunkmemset_1(from, &chunk);
-    } else
-#endif
+        memset(out, *from, len);
+        return out + len;
+    } else if (dist > sizeof(chunk_t)) {
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    chunk_t chunk_load;
+    uint32_t chunk_mod = 0;
+
+    /* TODO: possibly build up a permutation table for this if not an even modulus */
 #ifdef HAVE_CHUNKMEMSET_2
     if (dist == 2) {
-        chunkmemset_2(from, &chunk);
+        chunkmemset_2(from, &chunk_load);
     } else
 #endif
 #ifdef HAVE_CHUNKMEMSET_4
     if (dist == 4) {
-        chunkmemset_4(from, &chunk);
+        chunkmemset_4(from, &chunk_load);
     } else
 #endif
 #ifdef HAVE_CHUNKMEMSET_8
     if (dist == 8) {
-        chunkmemset_8(from, &chunk);
+        chunkmemset_8(from, &chunk_load);
+    } else if (dist == sizeof(chunk_t)) {
+        loadchunk(from, &chunk_load);
     } else
 #endif
-    if (dist == sz) {
-        loadchunk(from, &chunk);
-    } else if (dist < sz) {
-        unsigned char *end = out + len - 1;
-        while (len > dist) {
-            out = CHUNKCOPY_SAFE(out, from, dist, end);
-            len -= dist;
+    {
+        /* This code takes string of length dist from "from" and repeats
+         * it for as many times as can fit in a chunk_t (vector register) */
+        int32_t cpy_dist;
+        int32_t bytes_remaining = sizeof(chunk_t);
+        uint8_t *cur_chunk = (uint8_t*)&chunk_load;
+        while (bytes_remaining) {
+            cpy_dist = MIN(dist, bytes_remaining);
+            memcpy(cur_chunk, from, cpy_dist);
+            bytes_remaining -= cpy_dist;
+            cur_chunk += cpy_dist;
+            /* This allows us to bypass an expensive integer division since we're effectively
+             * counting in this loop, anyway. However, we may have to derive a similarly
+             * sensible solution for if we use a permutation table that allows us to construct
+             * this vector in one load and one permute instruction */
+            chunk_mod = cpy_dist;
         }
-        if (len > 0) {
-            out = CHUNKCOPY_SAFE(out, from, len, end);
+    }
+
+    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
+     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
+    if (chunk_mod == 0) {
+        while (len >= (2 * sizeof(chunk_t))) {
+            storechunk(out, &chunk_load);
+            storechunk(out + sizeof(chunk_t), &chunk_load);
+            out += 2 * sizeof(chunk_t);
+            len -= 2 * sizeof(chunk_t);
         }
-        return out;
-    } else {
-        out = CHUNKUNROLL(out, &dist, &len);
-        return CHUNKCOPY(out, out - dist, len);
     }
 
-    unsigned rem = len % sz;
-    len -= rem;
-    while (len) {
-        storechunk(out, &chunk);
-        out += sz;
-        len -= sz;
+    /* If we don't have a "dist" length that divides evenly into a vector
+     * register, we can write the whole vector register but we need only
+     * advance by the amount of the whole string that fits in our chunk_t.
+     * If we do divide evenly into the vector length, adv_amount = chunk_t size*/
+    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
+    while (len >= sizeof(chunk_t)) {
+        storechunk(out, &chunk_load);
+        len -= adv_amount;
+        out += adv_amount;
     }
 
-    /* Last, deal with the case when LEN is not a multiple of SZ. */
-    if (rem) {
-        memcpy(out, from, rem);
-        out += rem;
+    if (len) {
+        memcpy(out, &chunk_load, len);
+        out += len;
     }
 
     return out;
diff --git a/functable.c b/functable.c
index 5147d3f8..f39db295 100644
--- a/functable.c
+++ b/functable.c
@@ -273,32 +273,6 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
     return functable.chunkcopy(out, from, len);
 }
 
-Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
-    // Initialize default
-    functable.chunkcopy_safe = &chunkcopy_safe_c;
-
-#ifdef X86_SSE2_CHUNKSET
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
-    if (x86_cpu_has_sse2)
-# endif
-        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
-#endif
-#ifdef X86_AVX_CHUNKSET
-    if (x86_cpu_has_avx2)
-        functable.chunkcopy_safe = &chunkcopy_safe_avx;
-#endif
-#ifdef ARM_NEON_CHUNKSET
-    if (arm_cpu_has_neon)
-        functable.chunkcopy_safe = &chunkcopy_safe_neon;
-#endif
-#ifdef POWER8_VSX_CHUNKSET
-    if (power_cpu_has_arch_2_07)
-        functable.chunkcopy_safe = &chunkcopy_safe_power8;
-#endif
-
-    return functable.chunkcopy_safe(out, from, len, safe);
-}
-
 Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
     // Initialize default
     functable.chunkunroll = &chunkunroll_c;
@@ -436,7 +410,6 @@ Z_INTERNAL Z_TLS struct functable_s functable = {
     compare256_stub,
     chunksize_stub,
     chunkcopy_stub,
-    chunkcopy_safe_stub,
     chunkunroll_stub,
     chunkmemset_stub,
     chunkmemset_safe_stub,
diff --git a/functable.h b/functable.h
index 949c5b1b..a106c93a 100644
--- a/functable.h
+++ b/functable.h
@@ -18,7 +18,6 @@ struct functable_s {
     uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
     uint32_t (* chunksize) (void);
     uint8_t* (* chunkcopy) (uint8_t *out, uint8_t const *from, unsigned len);
-    uint8_t* (* chunkcopy_safe) (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
     uint8_t* (* chunkunroll) (uint8_t *out, unsigned *dist, unsigned *len);
     uint8_t* (* chunkmemset) (uint8_t *out, unsigned dist, unsigned len);
     uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
diff --git a/inffast.c b/inffast.c
index 58cbad7e..e0fb99c5 100644
--- a/inffast.c
+++ b/inffast.c
@@ -11,7 +11,6 @@
 #include "inflate_p.h"
 #include "functable.h"
 
-
 /* Load 64 bits from IN and place the bytes at offset BITS in the result. */
 static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
     uint64_t chunk;
@@ -430,7 +429,7 @@ void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start
                     from += wsize - op;
                     if (op < len) {         /* some from end of window */
                         len -= op;
-                        out = functable.chunkcopy_safe(out, from, op, safe);
+                        out = chunkcopy_safe(out, from, op, safe);
                         from = window;      /* more from start of window */
                         op = wnext;
                         /* This (rare) case can create a situation where
@@ -440,16 +439,16 @@ void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start
                 }
                 if (op < len) {             /* still need some from output */
                     len -= op;
-                    out = functable.chunkcopy_safe(out, from, op, safe);
+                    out = chunkcopy_safe(out, from, op, safe);
                     out = functable.chunkunroll(out, &dist, &len);
-                    out = functable.chunkcopy_safe(out, out - dist, len, safe);
+                    out = chunkcopy_safe(out, out - dist, len, safe);
                 } else {
-                    out = functable.chunkcopy_safe(out, from, len, safe);
+                    out = chunkcopy_safe(out, from, len, safe);
                 }
             } else if (extra_safe) {
                 /* Whole reference is in range of current output. */
                 if (dist >= len || dist >= state->chunksize)
-                    out = functable.chunkcopy_safe(out, out - dist, len, safe);
+                    out = chunkcopy_safe(out, out - dist, len, safe);
                 else
                     out = functable.chunkmemset_safe(out, dist, len, (unsigned)((safe - out) + 1));
             } else {
diff --git a/inflate.c b/inflate.c
index 06b37bf4..5e2af29c 100644
--- a/inflate.c
+++ b/inflate.c
@@ -887,7 +887,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
             }
             unsigned char *next_out = state->window + state->wsize + state->wnext;
             if (copy <= state->offset) {
-                functable.chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
+                chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
             } else { /* copy from output */
                 functable.chunkmemset_safe(next_out, state->offset, copy, (uint32_t)buf_left);
             }
diff --git a/inflate_p.h b/inflate_p.h
index c5ba13a0..86e3702b 100644
--- a/inflate_p.h
+++ b/inflate_p.h
@@ -211,3 +211,78 @@ static inline void window_output_flush(PREFIX3(stream) *strm) {
     state->whave += out_bytes;
     state->whave = MIN(state->whave, state->wsize);
 }
+
+/* Behave like chunkcopy, but avoid writing beyond of legal output. */
+static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe) {
+    uint32_t safelen = (uint32_t)((safe - out) + 1);
+    len = MIN(len, safelen);
+    int olap_src = from >= out && from < out + len;
+    int olap_dst = out >= from && out < from + len;
+    int tocopy;
+
+    /* For all cases without overlap, memcpy is ideal */
+    if (!(olap_src || olap_dst)) {
+        memcpy(out, from, len);
+        return out + len;
+    }
+
+    /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior,
+     * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the
+     * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest
+     * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look
+     * behind or lookahead distance */
+    int non_olap_size = (from > out) ? from - out : out - from;
+
+    memcpy(out, from, non_olap_size);
+    out += non_olap_size;
+    from += non_olap_size;
+    len -= non_olap_size;
+
+    /* So this doesn't give use a worst case scenario of function calls in a loop,
+     * we want to instead break this down into copy blocks of fixed lengths */
+    while (len) {
+        tocopy = MIN(non_olap_size, len);
+        len -= tocopy;
+
+        while (tocopy >= 32) {
+            memcpy(out, from, 32);
+            out += 32;
+            from += 32;
+            tocopy -= 32;
+        }
+
+        if (tocopy >= 16) {
+            memcpy(out, from, 16);
+            out += 16;
+            from += 16;
+            tocopy -= 16;
+        }
+
+        if (tocopy >= 8) {
+            zmemcpy_8(out, from);
+            out += 8;
+            from += 8;
+            tocopy -= 8;
+        }
+
+        if (tocopy >= 4) {
+            zmemcpy_4(out, from);
+            out += 4;
+            from += 4;
+            tocopy -= 4;
+        }
+
+        if (tocopy >= 2) {
+            zmemcpy_2(out, from);
+            out += 2;
+            from += 2;
+            tocopy -= 2;
+        }
+
+        if (tocopy) {
+            *out++ = *from++;
+        }
+    }
+
+    return out;
+}
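
Below is a small, self-contained sketch of the two ideas in this patch, not the zlib-ng code itself. chunk_memset_sketch() repeats the dist-byte pattern into a fixed 16-byte "chunk" and advances the output only by the largest multiple of dist that fits in it; the patch derives that remainder while filling the chunk (avoiding the division) and uses real loadchunk()/storechunk() vector operations, whereas this sketch uses plain memcpy(). overlap_copy_sketch() shows the non-overlapping block-copy trick behind the new chunkcopy_safe(); the patch additionally handles both overlap directions, unrolls into 32/16/8/4/2/1-byte copies, and clamps len against "safe". All names and the CHUNK_SZ constant here are hypothetical.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHUNK_SZ 16  /* stand-in for sizeof(chunk_t), i.e. one vector register */

/* Repeat the dist-byte pattern ending at "out" into a chunk, store whole
 * chunks, and advance only by the bytes that form complete copies of the
 * pattern so every store stays phase-aligned. Like the real chunk functions,
 * this may write up to CHUNK_SZ extra bytes, so the caller must leave slack. */
static uint8_t *chunk_memset_sketch(uint8_t *out, unsigned dist, unsigned len) {
    uint8_t *from = out - dist;

    if (dist == 1) {                    /* plain byte fill: libc memset is ideal */
        memset(out, *from, len);
        return out + len;
    }
    if (dist >= CHUNK_SZ || len < CHUNK_SZ) {   /* simple byte-at-a-time fallback */
        while (len--) {
            *out = *(out - dist);
            out++;
        }
        return out;
    }

    uint8_t chunk[CHUNK_SZ];
    for (unsigned filled = 0; filled < CHUNK_SZ; ) {   /* replicate the pattern */
        unsigned cpy = CHUNK_SZ - filled < dist ? CHUNK_SZ - filled : dist;
        memcpy(chunk + filled, from, cpy);
        filled += cpy;
    }

    /* The patch tracks this remainder while filling the chunk instead of dividing. */
    unsigned adv_amount = CHUNK_SZ - (CHUNK_SZ % dist);
    while (len >= CHUNK_SZ) {
        memcpy(out, chunk, CHUNK_SZ);   /* one full "storechunk" */
        out += adv_amount;              /* advance by whole pattern copies only */
        len -= adv_amount;
    }
    if (len) {                          /* tail smaller than a full chunk */
        memcpy(out, chunk, len);
        out += len;
    }
    return out;
}

/* Overlap handling behind the new chunkcopy_safe(): copy the non-overlapping
 * prefix first, then keep copying in blocks no larger than that prefix so each
 * memcpy() sees disjoint regions, emulating the byte-by-byte LZ77 copy without
 * undefined behavior. */
static uint8_t *overlap_copy_sketch(uint8_t *out, const uint8_t *from, unsigned len) {
    unsigned gap = (unsigned)(out - from);   /* assumes from < out, as in an LZ77 match */
    if (gap >= len) {                        /* no overlap at all */
        memcpy(out, from, len);
        return out + len;
    }
    memcpy(out, from, gap);                  /* bulk copy of the non-overlapping region */
    out += gap;
    from += gap;
    len -= gap;
    while (len) {                            /* fixed-size, non-overlapping block copies */
        unsigned tocopy = len < gap ? len : gap;
        memcpy(out, from, tocopy);
        out += tocopy;
        from += tocopy;
        len -= tocopy;
    }
    return out;
}

int main(void) {
    /* Emulate an LZ77 match with dist=3, len=40; leave CHUNK_SZ bytes of slack
     * because chunk_memset_sketch() may store a whole chunk past "len". */
    uint8_t a[64 + CHUNK_SZ] = "abc";
    chunk_memset_sketch(a + 3, 3, 40);
    printf("%.43s\n", (char *)a);            /* abcabcabc... */

    uint8_t b[64] = "xy";
    overlap_copy_sketch(b + 2, b, 20);       /* dist=2 overlapping copy */
    printf("%.22s\n", (char *)b);            /* xyxyxy... */
    return 0;
}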