From: Nathan Moinvaziri
Date: Thu, 18 Feb 2021 03:29:46 +0000 (-0800)
Subject: Sync changes between different slide_hash variants.
X-Git-Tag: 2.1.0-beta1~597
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2e37971ec0bb4775c041336ce16cbd7a718e8005;p=thirdparty%2Fzlib-ng.git

Sync changes between different slide_hash variants.
---

diff --git a/arch/arm/slide_neon.c b/arch/arm/slide_neon.c
index f64fa5b5b..81461391c 100644
--- a/arch/arm/slide_neon.c
+++ b/arch/arm/slide_neon.c
@@ -18,7 +18,7 @@
 #include "../../deflate.h"
 
 /* SIMD version of hash_chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+static inline void slide_hash_neon_chain(Pos *table, uint32_t entries, uint16_t wsize) {
     Z_REGISTER uint16x8_t v, *p;
     Z_REGISTER size_t n;
 
@@ -26,7 +26,7 @@ static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t w
     Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
     Assert(sizeof(Pos) == 2, "Wrong Pos size");
 
-    v = vdupq_n_u16(window_size);
+    v = vdupq_n_u16(wsize);
     p = (uint16x8_t *)table;
     n = size / (sizeof(uint16x8_t) * 8);
@@ -46,7 +46,7 @@ static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t w
 Z_INTERNAL void slide_hash_neon(deflate_state *s) {
     unsigned int wsize = s->w_size;
 
-    slide_hash_chain(s->head, HASH_SIZE, wsize);
-    slide_hash_chain(s->prev, wsize, wsize);
+    slide_hash_neon_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_neon_chain(s->prev, wsize, wsize);
 }
 #endif
diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c
index b1e30cea0..112561939 100644
--- a/arch/power/slide_hash_power8.c
+++ b/arch/power/slide_hash_power8.c
@@ -10,51 +10,47 @@
 #include "zbuild.h"
 #include "deflate.h"
 
-static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+static inline void slide_hash_power8_chain(Pos *table, uint32_t entries, uint16_t wsize) {
     vector unsigned short vw, vm, *vp;
     unsigned chunks;
 
+    table += entries;
+
     /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
-     * so instead of processing each of the n_elems in the hash table
+     * so instead of processing each of the entries in the hash table
      * individually, we can do it in chunks of 8 with vector instructions.
      *
      * This function is only called from slide_hash_power8(), and both calls
-     * pass n_elems as a power of 2 higher than 2^7, as defined by
-     * deflateInit2_(), so n_elems will always be a multiple of 8. */
-    chunks = n_elems >> 3;
-    Assert(n_elems % 8 == 0, "Weird hash table size!");
+     * pass entries as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so entries will always be a multiple of 8. */
+    chunks = entries >> 3;
+    Assert(entries % 8 == 0, "Weird hash table size!");
 
-    /* This type casting is safe since s->w_size is always <= 64KB
+    /* This type casting is safe since wsize is always <= 64KB
      * as defined by deflateInit2_() and Posf == unsigned short */
-    vw[0] = (Pos) s->w_size;
+    vw[0] = wsize;
     vw = vec_splat(vw,0);
 
-    vp = (vector unsigned short *) table_end;
+    vp = (vector unsigned short *)table;
 
     do {
         /* Processing 8 elements at a time */
        vp--;
        vm = *vp;
 
-       /* This is equivalent to: m >= w_size ? m - w_size : 0
+       /* This is equivalent to: m >= wsize ? m - wsize : 0
         * Since we are using a saturated unsigned subtraction, any
-        * values that are < w_size will be set to 0, while the others
-        * will be reduced by w_size. */
+        * values that are < wsize will be set to 0, while the others
+        * will be reduced by wsize. */
        *vp = vec_subs(vm,vw);
     } while (--chunks);
 }
 
 void Z_INTERNAL slide_hash_power8(deflate_state *s) {
-    unsigned int n;
-    Pos *p;
-
-    n = HASH_SIZE;
-    p = &s->head[n];
-    slide_hash_power8_loop(s,n,p);
+    uint16_t wsize = s->w_size;
 
-    n = s->w_size;
-    p = &s->prev[n];
-    slide_hash_power8_loop(s,n,p);
+    slide_hash_power8_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_power8_chain(s->prev, wsize, wsize);
 }
 
 #endif /* POWER8_VSX_SLIDEHASH */
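Every variant touched by this commit performs the same rebase: when deflate's window slides down by w_size bytes, each stored match position must also drop by w_size, and positions that fall out of the window must become 0, the end-of-chain marker. A saturating unsigned subtraction (vec_subs above; the NEON and x86 files use their architectures' equivalents) does both at once across 8 or 16 lanes. A minimal scalar model of what each lane computes (illustrative only, not part of the patch):

    #include <stdint.h>

    /* One hash-table entry: an unsigned saturating subtract yields
     * max(m - wsize, 0), i.e. "m >= wsize ? m - wsize : 0". */
    static uint16_t slide_one_entry(uint16_t m, uint16_t wsize) {
        return (uint16_t)(m >= wsize ? m - wsize : 0);
    }

Positions still inside the window shift down by wsize; positions that slid out collapse to 0, terminating their hash chain.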
diff --git a/arch/x86/slide_avx.c b/arch/x86/slide_avx.c
index be9a9b7ea..01c788df1 100644
--- a/arch/x86/slide_avx.c
+++ b/arch/x86/slide_avx.c
@@ -14,34 +14,26 @@
 
 #include <immintrin.h>
 
-Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    uint16_t wsize = (uint16_t)s->w_size;
-    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+static inline void slide_hash_avx2_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
 
-    n = HASH_SIZE;
-    p = &s->head[n] - 16;
     do {
         __m256i value, result;
 
-        value = _mm256_loadu_si256((__m256i *)p);
-        result= _mm256_subs_epu16(value, ymm_wsize);
-        _mm256_storeu_si256((__m256i *)p, result);
-        p -= 16;
-        n -= 16;
-    } while (n > 0);
+        value = _mm256_loadu_si256((__m256i *)table);
+        result = _mm256_subs_epu16(value, wsize);
+        _mm256_storeu_si256((__m256i *)table, result);
 
-    n = wsize;
-    p = &s->prev[n] - 16;
-    do {
-        __m256i value, result;
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
 
-        value = _mm256_loadu_si256((__m256i *)p);
-        result= _mm256_subs_epu16(value, ymm_wsize);
-        _mm256_storeu_si256((__m256i *)p, result);
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
 
-        p -= 16;
-        n -= 16;
-    } while (n > 0);
+    slide_hash_avx2_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_avx2_chain(s->prev, wsize, ymm_wsize);
 }
diff --git a/arch/x86/slide_sse.c b/arch/x86/slide_sse.c
index abf447475..65d58a71e 100644
--- a/arch/x86/slide_sse.c
+++ b/arch/x86/slide_sse.c
@@ -13,34 +13,26 @@
 
 #include <emmintrin.h>
 
-Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    uint16_t wsize = (uint16_t)s->w_size;
-    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+static inline void slide_hash_sse2_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+    table += entries;
+    table -= 8;
 
-    n = HASH_SIZE;
-    p = &s->head[n] - 8;
     do {
         __m128i value, result;
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
-        p -= 8;
-        n -= 8;
-    } while (n > 0);
+        value = _mm_loadu_si128((__m128i *)table);
+        result = _mm_subs_epu16(value, wsize);
+        _mm_storeu_si128((__m128i *)table, result);
 
-    n = wsize;
-    p = &s->prev[n] - 8;
-    do {
-        __m128i value, result;
+        table -= 8;
+        entries -= 8;
+    } while (entries > 0);
+}
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
 
-        p -= 8;
-        n -= 8;
-    } while (n > 0);
+    slide_hash_sse2_chain(s->head, HASH_SIZE, xmm_wsize);
+    slide_hash_sse2_chain(s->prev, wsize, xmm_wsize);
 }
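A quick way to convince yourself that the vector path matches the scalar rebase is to run the SSE2 saturating subtract against the ternary expression on boundary values. A self-contained check along these lines (a hedged sketch, not part of the patch; assumes an SSE2-capable x86 host):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Boundary values around a typical 32K deflate window. */
        uint16_t table[8] = {0, 1, 32767, 32768, 32769, 40000, 65000, 65535};
        const uint16_t wsize = 32768;
        const __m128i vwsize = _mm_set1_epi16((short)wsize);

        __m128i v = _mm_loadu_si128((const __m128i *)table);
        __m128i r = _mm_subs_epu16(v, vwsize);  /* per-lane saturating subtract */

        uint16_t out[8];
        _mm_storeu_si128((__m128i *)out, r);

        for (int i = 0; i < 8; i++) {
            uint16_t expect = (uint16_t)(table[i] >= wsize ? table[i] - wsize : 0);
            printf("%5u -> %5u (%s)\n", table[i], out[i],
                   out[i] == expect ? "ok" : "MISMATCH");
        }
        return 0;
    }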
diff --git a/deflate.c b/deflate.c
index ca9dafa8d..2aae899be 100644
--- a/deflate.c
+++ b/deflate.c
@@ -188,20 +188,19 @@ static const config configuration_table[10] = {
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-Z_INTERNAL void slide_hash_c(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    unsigned int wsize = s->w_size;
-
-    n = HASH_SIZE;
-    p = &s->head[n];
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
 #ifdef NOT_TWEAK_COMPILER
+    table += entries;
     do {
         unsigned m;
-        m = *--p;
-        *p = (Pos)(m >= wsize ? m-wsize : 0);
-    } while (--n);
+        m = *--table;
+        *table = (Pos)(m >= wsize ? m-wsize : 0);
+        /* If entries is not on any hash chain, prev[entries] is garbage but
+         * its value will never be used.
+         */
+    } while (--entries);
 #else
+    {
     /* As of I make this change, gcc (4.8.*) isn't able to vectorize
      * this hot loop using saturated-subtraction on x86-64 architecture.
      * To avoid this defect, we can change the loop such that
      *    o. the pointer advance forward, and
      *    o. demote the variable 'm' to be local to the loop, and
@@ -210,40 +209,23 @@ Z_INTERNAL void slide_hash_c(deflate_state *s) {
      *       choose type "Pos" (instead of 'unsigned int') for the
      *       variable to avoid unnecessary zero-extension.
      */
-    {
     unsigned int i;
-    Pos *q = p - n;
-    for (i = 0; i < n; i++) {
+    Pos *q = table;
+    for (i = 0; i < entries; i++) {
         Pos m = *q;
         Pos t = (Pos)wsize;
         *q++ = (Pos)(m >= t ? m-t: 0);
     }
     }
 #endif /* NOT_TWEAK_COMPILER */
+}
 
-    n = wsize;
-    p = &s->prev[n];
-#ifdef NOT_TWEAK_COMPILER
-    do {
-        unsigned m;
-        m = *--p;
-        *p = (Pos)(m >= wsize ? m-wsize : 0);
-        /* If n is not on any hash chain, prev[n] is garbage but
-         * its value will never be used.
-         */
-    } while (--n);
-#else
-    {
-        unsigned int i;
-        Pos *q = p - n;
-        for (i = 0; i < n; i++) {
-            Pos m = *q;
-            Pos t = (Pos)wsize;
-            *q++ = (Pos)(m >= t ? m-t: 0);
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_c_chain(s->prev, wsize, wsize);
 }
-    }
-#endif /* NOT_TWEAK_COMPILER */
-}
 
 /* ========================================================================= */
 int32_t Z_EXPORT PREFIX(deflateInit_)(PREFIX3(stream) *strm, int32_t level, const char *version, int32_t stream_size) {
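With the C fallback converted as well, every backend now reduces to the same two calls: rebase s->head (HASH_SIZE buckets) and s->prev (w_size window positions) with one per-table chain helper. A condensed model of that shared shape (illustrative; zlib-ng really dispatches through its functable, and the x86 variants pass a pre-splatted vector rather than a scalar wsize):

    #include <stdint.h>

    typedef uint16_t Pos;  /* zlib-ng's 16-bit match-position type */

    /* The calling pattern now common to slide_hash_c, slide_hash_neon,
     * slide_hash_power8, slide_hash_sse2 and slide_hash_avx2. */
    static void slide_hash_shape(Pos *head, Pos *prev,
                                 uint32_t hash_size, uint16_t wsize,
                                 void (*chain)(Pos *, uint32_t, uint16_t)) {
        chain(head, hash_size, wsize);  /* one entry per hash bucket */
        chain(prev, wsize, wsize);      /* one entry per window position */
    }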