#include "../../deflate.h"
/* NEON SIMD version of the hash chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+static inline void slide_hash_neon_chain(Pos *table, uint32_t entries, uint16_t wsize) {
Z_REGISTER uint16x8_t v, *p;
Z_REGISTER size_t n;
size_t size = entries * sizeof(table[0]);
Assert(size % (sizeof(uint16x8_t) * 8) == 0, "hash table size err");
Assert(sizeof(Pos) == 2, "Wrong Pos size");
- v = vdupq_n_u16(window_size);
+ v = vdupq_n_u16(wsize);
p = (uint16x8_t *)table;
n = size / (sizeof(uint16x8_t) * 8);
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
- unsigned int wsize = s->w_size;
+ uint16_t wsize = (uint16_t)s->w_size;
- slide_hash_chain(s->head, HASH_SIZE, wsize);
- slide_hash_chain(s->prev, wsize, wsize);
+ slide_hash_neon_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_neon_chain(s->prev, wsize, wsize);
}
#endif
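The hunk above elides the body of the NEON loop. A plausible sketch of what it looks like, built only from the shown setup (v, p, n) and the saturating-subtract intrinsic vqsubq_u16 (illustration only, not literal patch content):

    do {
        /* Saturating 16-bit subtract: lanes smaller than wsize clamp to 0,
         * the rest are rebased downward by wsize. */
        p[0] = vqsubq_u16(p[0], v);
        p[1] = vqsubq_u16(p[1], v);
        p[2] = vqsubq_u16(p[2], v);
        p[3] = vqsubq_u16(p[3], v);
        p[4] = vqsubq_u16(p[4], v);
        p[5] = vqsubq_u16(p[5], v);
        p[6] = vqsubq_u16(p[6], v);
        p[7] = vqsubq_u16(p[7], v);
        p += 8;
    } while (--n);

Eight vectors per iteration is consistent with the n = size / (sizeof(uint16x8_t) * 8) computed above.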
#include "zbuild.h"
#include "deflate.h"
-static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+static inline void slide_hash_power8_chain(Pos *table, uint32_t entries, uint16_t wsize) {
vector unsigned short vw, vm, *vp;
unsigned chunks;
+ table += entries;
+
/* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
- * so instead of processing each of the n_elems in the hash table
+ * so instead of processing each of the entries in the hash table
* individually, we can do it in chunks of 8 with vector instructions.
*
* This function is only called from slide_hash_power8(), and both calls
- * pass n_elems as a power of 2 higher than 2^7, as defined by
- * deflateInit2_(), so n_elems will always be a multiple of 8. */
- chunks = n_elems >> 3;
- Assert(n_elems % 8 == 0, "Weird hash table size!");
+ * pass entries as a power of 2 higher than 2^7, as defined by
+ * deflateInit2_(), so entries will always be a multiple of 8. */
+ chunks = entries >> 3;
+ Assert(entries % 8 == 0, "Weird hash table size!");
- /* This type casting is safe since s->w_size is always <= 64KB
+ /* This type casting is safe since wsize is at most 32768
* as defined by deflateInit2_() and Posf == unsigned short */
- vw[0] = (Pos) s->w_size;
+ vw[0] = wsize;
vw = vec_splat(vw,0);
- vp = (vector unsigned short *) table_end;
+ vp = (vector unsigned short *)table;
do {
/* Processing 8 elements at a time */
vp--;
vm = *vp;
- /* This is equivalent to: m >= w_size ? m - w_size : 0
+ /* This is equivalent to: m >= wsize ? m - wsize : 0
* Since we are using a saturated unsigned subtraction, any
- * values that are > w_size will be set to 0, while the others
- * will be subtracted by w_size. */
+ * values that are < wsize will be set to 0, while the others
+ * will be reduced by wsize. */
*vp = vec_subs(vm,vw);
} while (--chunks);
}
void Z_INTERNAL slide_hash_power8(deflate_state *s) {
- unsigned int n;
- Pos *p;
-
- n = HASH_SIZE;
- p = &s->head[n];
- slide_hash_power8_loop(s,n,p);
+ uint16_t wsize = (uint16_t)s->w_size;
- n = s->w_size;
- p = &s->prev[n];
- slide_hash_power8_loop(s,n,p);
+ slide_hash_power8_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_power8_chain(s->prev, wsize, wsize);
}
#endif /* POWER8_VSX_SLIDEHASH */
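The NEON and VSX paths rely on the same saturated-subtract identity, so a scalar model may help. A minimal, self-contained sketch (the helper name rebase_pos is made up; this is not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of vec_subs / vqsubq_u16 applied to one hash entry. */
    static uint16_t rebase_pos(uint16_t m, uint16_t wsize) {
        return (uint16_t)(m >= wsize ? m - wsize : 0);
    }

    int main(void) {
        /* Positions that fell out of the window saturate to 0 (chain end)... */
        assert(rebase_pos(100, 32768) == 0);
        /* ...while positions still in range are shifted down by wsize. */
        assert(rebase_pos(40000, 32768) == 40000 - 32768);
        return 0;
    }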
#include <immintrin.h>
-Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+static inline void slide_hash_avx2_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+ table += entries;
+ table -= 16;
- n = HASH_SIZE;
- p = &s->head[n] - 16;
do {
__m256i value, result;
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ value = _mm256_loadu_si256((__m256i *)table);
+ result = _mm256_subs_epu16(value, wsize);
+ _mm256_storeu_si256((__m256i *)table, result);
- n = wsize;
- p = &s->prev[n] - 16;
- do {
- __m256i value, result;
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+}
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ slide_hash_avx2_chain(s->head, HASH_SIZE, ymm_wsize);
+ slide_hash_avx2_chain(s->prev, wsize, ymm_wsize);
}
#include <immintrin.h>
-Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+static inline void slide_hash_sse2_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+ table += entries;
+ table -= 8;
- n = HASH_SIZE;
- p = &s->head[n] - 8;
do {
__m128i value, result;
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
- p -= 8;
- n -= 8;
- } while (n > 0);
+ value = _mm_loadu_si128((__m128i *)table);
+ result = _mm_subs_epu16(value, wsize);
+ _mm_storeu_si128((__m128i *)table, result);
- n = wsize;
- p = &s->prev[n] - 8;
- do {
- __m128i value, result;
+ table -= 8;
+ entries -= 8;
+ } while (entries > 0);
+}
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
- p -= 8;
- n -= 8;
- } while (n > 0);
+ slide_hash_sse2_chain(s->head, HASH_SIZE, xmm_wsize);
+ slide_hash_sse2_chain(s->prev, wsize, xmm_wsize);
}
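One contract is worth calling out for both x86 helpers: each do/while loop steps a full vector of positions per iteration (16 for AVX2, 8 for SSE2) and decrements an unsigned counter, so entries must be a nonzero multiple of the step or the subtraction wraps and the loop never exits. Both call sites satisfy this, since HASH_SIZE and s->w_size are powers of two no smaller than the vector width. A guard one could place at the top of each chain helper (a sketch, not in the patch; shown for the AVX2 case):

    /* entries must be a nonzero multiple of the SIMD step (16 for AVX2),
     * otherwise entries -= 16 wraps around and the loop never terminates. */
    Assert(entries >= 16 && entries % 16 == 0, "entries must be a multiple of 16");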
* bit values at the expense of memory usage). We slide even when level == 0 to
* keep the hash table consistent if we switch back to level > 0 later.
*/
-Z_INTERNAL void slide_hash_c(deflate_state *s) {
- Pos *p;
- unsigned n;
- unsigned int wsize = s->w_size;
-
- n = HASH_SIZE;
- p = &s->head[n];
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
#ifdef NOT_TWEAK_COMPILER
+ table += entries;
do {
unsigned m;
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : 0);
- } while (--n);
+ m = *--table;
+ *table = (Pos)(m >= wsize ? m-wsize : 0);
+ /* If a position is not on any hash chain, its prev[] slot holds
+ * garbage, but that value will never be used.
+ */
+ } while (--entries);
#else
+ {
/* As of the time of this change, gcc (4.8.*) isn't able to vectorize
* this hot loop using saturated subtraction on the x86-64 architecture.
* To avoid this defect, we change the loop such that
*    o. the pointer advances forward, and
*    o. the variable 'm' is local to the loop, and is declared with
*       type "Pos" (instead of 'unsigned int') to avoid unnecessary
*       zero-extension.
*/
- {
unsigned int i;
- Pos *q = p - n;
- for (i = 0; i < n; i++) {
+ Pos *q = table;
+ for (i = 0; i < entries; i++) {
Pos m = *q;
Pos t = (Pos)wsize;
*q++ = (Pos)(m >= t ? m-t: 0);
}
}
#endif /* NOT_TWEAK_COMPILER */
+}
- n = wsize;
- p = &s->prev[n];
-#ifdef NOT_TWEAK_COMPILER
- do {
- unsigned m;
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : 0);
- /* If n is not on any hash chain, prev[n] is garbage but
- * its value will never be used.
- */
- } while (--n);
-#else
- {
- unsigned int i;
- Pos *q = p - n;
- for (i = 0; i < n; i++) {
- Pos m = *q;
- Pos t = (Pos)wsize;
- *q++ = (Pos)(m >= t ? m-t: 0);
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_c_chain(s->prev, wsize, wsize);
}
- }
-#endif /* NOT_TWEAK_COMPILER */
-}
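To make the rebase semantics concrete, here is a tiny standalone harness (hypothetical, not part of the patch) that applies the same transform as slide_hash_c_chain to a toy table:

    #include <stdio.h>

    typedef unsigned short Pos;

    /* Same per-entry transform as slide_hash_c_chain's vectorizer-friendly branch. */
    static void rebase(Pos *table, unsigned entries, Pos wsize) {
        for (unsigned i = 0; i < entries; i++)
            table[i] = (Pos)(table[i] >= wsize ? table[i] - wsize : 0);
    }

    int main(void) {
        Pos t[4] = {0, 3, 4, 100};
        rebase(t, 4, 4);
        printf("%u %u %u %u\n", t[0], t[1], t[2], t[3]); /* prints: 0 0 0 96 */
        return 0;
    }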
/* ========================================================================= */
int32_t Z_EXPORT PREFIX(deflateInit_)(PREFIX3(stream) *strm, int32_t level, const char *version, int32_t stream_size) {