From: Nathan Moinvaziri
Date: Thu, 18 Feb 2021 03:29:46 +0000 (-0800)
Subject: Sync changes between different slide_hash variants.
X-Git-Tag: 2.1.0-beta1~597
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2e37971ec0bb4775c041336ce16cbd7a718e8005;p=thirdparty%2Fzlib-ng.git

Sync changes between different slide_hash variants.
---

diff --git a/arch/arm/slide_neon.c b/arch/arm/slide_neon.c
index f64fa5b5b..81461391c 100644
--- a/arch/arm/slide_neon.c
+++ b/arch/arm/slide_neon.c
@@ -18,7 +18,7 @@
 #include "../../deflate.h"
 
 /* SIMD version of hash_chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+static inline void slide_hash_neon_chain(Pos *table, uint32_t entries, uint16_t wsize) {
     Z_REGISTER uint16x8_t v, *p;
     Z_REGISTER size_t n;
 
@@ -26,7 +26,7 @@ static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t w
     Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
     Assert(sizeof(Pos) == 2, "Wrong Pos size");
 
-    v = vdupq_n_u16(window_size);
+    v = vdupq_n_u16(wsize);
     p = (uint16x8_t *)table;
     n = size / (sizeof(uint16x8_t) * 8);
@@ -46,7 +46,7 @@ static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t w
 Z_INTERNAL void slide_hash_neon(deflate_state *s) {
     unsigned int wsize = s->w_size;
 
-    slide_hash_chain(s->head, HASH_SIZE, wsize);
-    slide_hash_chain(s->prev, wsize, wsize);
+    slide_hash_neon_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_neon_chain(s->prev, wsize, wsize);
 }
 #endif
diff --git a/arch/power/slide_hash_power8.c b/arch/power/slide_hash_power8.c
index b1e30cea0..112561939 100644
--- a/arch/power/slide_hash_power8.c
+++ b/arch/power/slide_hash_power8.c
@@ -10,51 +10,47 @@
 #include "zbuild.h"
 #include "deflate.h"
 
-static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+static inline void slide_hash_power8_chain(Pos *table, uint32_t entries, uint16_t wsize) {
     vector unsigned short vw, vm, *vp;
     unsigned chunks;
 
+    table += entries;
+
     /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
-     * so instead of processing each of the n_elems in the hash table
+     * so instead of processing each of the entries in the hash table
      * individually, we can do it in chunks of 8 with vector instructions.
      *
      * This function is only called from slide_hash_power8(), and both calls
-     * pass n_elems as a power of 2 higher than 2^7, as defined by
-     * deflateInit2_(), so n_elems will always be a multiple of 8. */
-    chunks = n_elems >> 3;
-    Assert(n_elems % 8 == 0, "Weird hash table size!");
+     * pass entries as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so entries will always be a multiple of 8. */
+    chunks = entries >> 3;
+    Assert(entries % 8 == 0, "Weird hash table size!");
 
-    /* This type casting is safe since s->w_size is always <= 64KB
+    /* This type casting is safe since wsize is always <= 64KB
      * as defined by deflateInit2_() and Posf == unsigned short */
-    vw[0] = (Pos) s->w_size;
+    vw[0] = wsize;
     vw = vec_splat(vw,0);
 
-    vp = (vector unsigned short *) table_end;
+    vp = (vector unsigned short *)table;
 
     do {
         /* Processing 8 elements at a time */
        vp--;
        vm = *vp;
 
-       /* This is equivalent to: m >= w_size ? m - w_size : 0
+       /* This is equivalent to: m >= wsize ? m - wsize : 0
         * Since we are using a saturated unsigned subtraction, any
-        * values that are < w_size will be set to 0, while the others
-        * will be reduced by w_size. */
+        * values that are < wsize will be set to 0, while the others
+        * will be reduced by wsize. */
        *vp = vec_subs(vm,vw);
     } while (--chunks);
 }
 
 void Z_INTERNAL slide_hash_power8(deflate_state *s) {
-    unsigned int n;
-    Pos *p;
-
-    n = HASH_SIZE;
-    p = &s->head[n];
-    slide_hash_power8_loop(s,n,p);
+    uint16_t wsize = s->w_size;
 
-    n = s->w_size;
-    p = &s->prev[n];
-    slide_hash_power8_loop(s,n,p);
+    slide_hash_power8_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_power8_chain(s->prev, wsize, wsize);
 }
 
 #endif /* POWER8_VSX_SLIDEHASH */
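Every variant touched by this commit performs the same rebase: when deflate's window slides down by w_size bytes, each stored match position must also drop by w_size, and positions that fall out of the window must become 0, the end-of-chain marker. A saturating unsigned subtraction (vec_subs above; the NEON and x86 files use their architectures' equivalents) does both at once across 8 or 16 lanes. A minimal scalar model of what each lane computes (illustrative only, not part of the patch):

    #include <stdint.h>

    /* One hash-table entry: an unsigned saturating subtract yields
     * max(m - wsize, 0), i.e. "m >= wsize ? m - wsize : 0". */
    static uint16_t slide_one_entry(uint16_t m, uint16_t wsize) {
        return (uint16_t)(m >= wsize ? m - wsize : 0);
    }

Positions still inside the window shift down by wsize; positions that slid out collapse to 0, terminating their hash chain.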
diff --git a/arch/x86/slide_avx.c b/arch/x86/slide_avx.c
index be9a9b7ea..01c788df1 100644
--- a/arch/x86/slide_avx.c
+++ b/arch/x86/slide_avx.c
@@ -14,34 +14,26 @@
 
 #include <immintrin.h>
 
-Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    uint16_t wsize = (uint16_t)s->w_size;
-    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+static inline void slide_hash_avx2_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
 
-    n = HASH_SIZE;
-    p = &s->head[n] - 16;
     do {
         __m256i value, result;
 
-        value = _mm256_loadu_si256((__m256i *)p);
-        result= _mm256_subs_epu16(value, ymm_wsize);
-        _mm256_storeu_si256((__m256i *)p, result);
-        p -= 16;
-        n -= 16;
-    } while (n > 0);
+        value = _mm256_loadu_si256((__m256i *)table);
+        result = _mm256_subs_epu16(value, wsize);
+        _mm256_storeu_si256((__m256i *)table, result);
 
-    n = wsize;
-    p = &s->prev[n] - 16;
-    do {
-        __m256i value, result;
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
 
-        value = _mm256_loadu_si256((__m256i *)p);
-        result= _mm256_subs_epu16(value, ymm_wsize);
-        _mm256_storeu_si256((__m256i *)p, result);
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
 
-        p -= 16;
-        n -= 16;
-    } while (n > 0);
+    slide_hash_avx2_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_avx2_chain(s->prev, wsize, ymm_wsize);
 }
diff --git a/arch/x86/slide_sse.c b/arch/x86/slide_sse.c
index abf447475..65d58a71e 100644
--- a/arch/x86/slide_sse.c
+++ b/arch/x86/slide_sse.c
@@ -13,34 +13,26 @@
 
 #include <emmintrin.h>
 
-Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    uint16_t wsize = (uint16_t)s->w_size;
-    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+static inline void slide_hash_sse2_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+    table += entries;
+    table -= 8;
 
-    n = HASH_SIZE;
-    p = &s->head[n] - 8;
     do {
         __m128i value, result;
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
-        p -= 8;
-        n -= 8;
-    } while (n > 0);
+        value = _mm_loadu_si128((__m128i *)table);
+        result = _mm_subs_epu16(value, wsize);
+        _mm_storeu_si128((__m128i *)table, result);
 
-    n = wsize;
-    p = &s->prev[n] - 8;
-    do {
-        __m128i value, result;
+        table -= 8;
+        entries -= 8;
+    } while (entries > 0);
+}
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
 
-        p -= 8;
-        n -= 8;
-    } while (n > 0);
+    slide_hash_sse2_chain(s->head, HASH_SIZE, xmm_wsize);
+    slide_hash_sse2_chain(s->prev, wsize, xmm_wsize);
 }
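A quick way to convince yourself that the vector path matches the scalar rebase is to run the SSE2 saturating subtract against the ternary expression on boundary values. A self-contained check along these lines (a hedged sketch, not part of the patch; assumes an SSE2-capable x86 host):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Boundary values around a typical 32K deflate window. */
        uint16_t table[8] = {0, 1, 32767, 32768, 32769, 40000, 65000, 65535};
        const uint16_t wsize = 32768;
        const __m128i vwsize = _mm_set1_epi16((short)wsize);

        __m128i v = _mm_loadu_si128((const __m128i *)table);
        __m128i r = _mm_subs_epu16(v, vwsize);  /* per-lane saturating subtract */

        uint16_t out[8];
        _mm_storeu_si128((__m128i *)out, r);

        for (int i = 0; i < 8; i++) {
            uint16_t expect = (uint16_t)(table[i] >= wsize ? table[i] - wsize : 0);
            printf("%5u -> %5u (%s)\n", table[i], out[i],
                   out[i] == expect ? "ok" : "MISMATCH");
        }
        return 0;
    }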
diff --git a/deflate.c b/deflate.c
index ca9dafa8d..2aae899be 100644
--- a/deflate.c
+++ b/deflate.c
@@ -188,20 +188,19 @@ static const config configuration_table[10] = {
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-Z_INTERNAL void slide_hash_c(deflate_state *s) {
-    Pos *p;
-    unsigned n;
-    unsigned int wsize = s->w_size;
-
-    n = HASH_SIZE;
-    p = &s->head[n];
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
 #ifdef NOT_TWEAK_COMPILER
+    table += entries;
     do {
         unsigned m;
-        m = *--p;
-        *p = (Pos)(m >= wsize ? m-wsize : 0);
-    } while (--n);
+        m = *--table;
+        *table = (Pos)(m >= wsize ? m-wsize : 0);
+        /* If entries is not on any hash chain, prev[entries] is garbage but
+         * its value will never be used.
+         */
+    } while (--entries);
 #else
+    {
     /* As of I make this change, gcc (4.8.*) isn't able to vectorize
      * this hot loop using saturated-subtraction on x86-64 architecture.
      * To avoid this defect, we can change the loop such that
      *    o. the pointer advance forward, and
      *    o. demote the variable 'm' to be local to the loop, and
@@ -210,40 +209,23 @@ Z_INTERNAL void slide_hash_c(deflate_state *s) {
      *       choose type "Pos" (instead of 'unsigned int') for the
      *       variable to avoid unnecessary zero-extension.
      */
-    {
     unsigned int i;
-    Pos *q = p - n;
-    for (i = 0; i < n; i++) {
+    Pos *q = table;
+    for (i = 0; i < entries; i++) {
         Pos m = *q;
         Pos t = (Pos)wsize;
         *q++ = (Pos)(m >= t ? m-t: 0);
     }
     }
 #endif /* NOT_TWEAK_COMPILER */
+}
 
-    n = wsize;
-    p = &s->prev[n];
-#ifdef NOT_TWEAK_COMPILER
-    do {
-        unsigned m;
-        m = *--p;
-        *p = (Pos)(m >= wsize ? m-wsize : 0);
-        /* If n is not on any hash chain, prev[n] is garbage but
-         * its value will never be used.
-         */
-    } while (--n);
-#else
-    {
-        unsigned int i;
-        Pos *q = p - n;
-        for (i = 0; i < n; i++) {
-            Pos m = *q;
-            Pos t = (Pos)wsize;
-            *q++ = (Pos)(m >= t ? m-t: 0);
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_c_chain(s->prev, wsize, wsize);
 }
-    }
-#endif /* NOT_TWEAK_COMPILER */
-}
 
 /* ========================================================================= */
 int32_t Z_EXPORT PREFIX(deflateInit_)(PREFIX3(stream) *strm, int32_t level, const char *version, int32_t stream_size) {
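With the C fallback converted as well, every backend now reduces to the same two calls: rebase s->head (HASH_SIZE buckets) and s->prev (w_size window positions) with one per-table chain helper. A condensed model of that shared shape (illustrative; zlib-ng really dispatches through its functable, and the x86 variants pass a pre-splatted vector rather than a scalar wsize):

    #include <stdint.h>

    typedef uint16_t Pos;  /* zlib-ng's 16-bit match-position type */

    /* The calling pattern now common to slide_hash_c, slide_hash_neon,
     * slide_hash_power8, slide_hash_sse2 and slide_hash_avx2. */
    static void slide_hash_shape(Pos *head, Pos *prev,
                                 uint32_t hash_size, uint16_t wsize,
                                 void (*chain)(Pos *, uint32_t, uint16_t)) {
        chain(head, hash_size, wsize);  /* one entry per hash bucket */
        chain(prev, wsize, wsize);      /* one entry per window position */
    }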