#include "../../deflate.h"
/* NEON SIMD version of the hash chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+static inline void slide_hash_neon_chain(Pos *table, uint32_t entries, uint16_t wsize) {
Z_REGISTER uint16x8_t v, *p;
Z_REGISTER size_t n;
size_t size = entries * sizeof(table[0]);
Assert(size % (sizeof(uint16x8_t) * 8) == 0, "hash table size err");
Assert(sizeof(Pos) == 2, "Wrong Pos size");
- v = vdupq_n_u16(window_size);
+ v = vdupq_n_u16(wsize);
p = (uint16x8_t *)table;
n = size / (sizeof(uint16x8_t) * 8);
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
- unsigned int wsize = s->w_size;
+ uint16_t wsize = (uint16_t)s->w_size;
- slide_hash_chain(s->head, HASH_SIZE, wsize);
- slide_hash_chain(s->prev, wsize, wsize);
+ slide_hash_neon_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_neon_chain(s->prev, wsize, wsize);
}
#endif
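The hunk above elides the body of the NEON loop. A plausible sketch of what it looks like, built only from the shown setup (v, p, n) and the saturating-subtract intrinsic vqsubq_u16 (illustration only, not literal patch content):

    do {
        /* Saturating 16-bit subtract: lanes smaller than wsize clamp to 0,
         * the rest are rebased downward by wsize. */
        p[0] = vqsubq_u16(p[0], v);
        p[1] = vqsubq_u16(p[1], v);
        p[2] = vqsubq_u16(p[2], v);
        p[3] = vqsubq_u16(p[3], v);
        p[4] = vqsubq_u16(p[4], v);
        p[5] = vqsubq_u16(p[5], v);
        p[6] = vqsubq_u16(p[6], v);
        p[7] = vqsubq_u16(p[7], v);
        p += 8;
    } while (--n);

Eight vectors per iteration is consistent with the n = size / (sizeof(uint16x8_t) * 8) computed above.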
#include "zbuild.h"
#include "deflate.h"
-static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+static inline void slide_hash_power8_chain(Pos *table, uint32_t entries, uint16_t wsize) {
vector unsigned short vw, vm, *vp;
unsigned chunks;
+ table += entries;
+
/* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
- * so instead of processing each of the n_elems in the hash table
+ * so instead of processing each of the entries in the hash table
* individually, we can do it in chunks of 8 with vector instructions.
*
* This function is only called from slide_hash_power8(), and both calls
- * pass n_elems as a power of 2 higher than 2^7, as defined by
- * deflateInit2_(), so n_elems will always be a multiple of 8. */
- chunks = n_elems >> 3;
- Assert(n_elems % 8 == 0, "Weird hash table size!");
+ * pass entries as a power of 2 higher than 2^7, as defined by
+ * deflateInit2_(), so entries will always be a multiple of 8. */
+ chunks = entries >> 3;
+ Assert(entries % 8 == 0, "Weird hash table size!");
- /* This type casting is safe since s->w_size is always <= 64KB
+ /* This type casting is safe since wsize is at most 32768
* as defined by deflateInit2_() and Posf == unsigned short */
- vw[0] = (Pos) s->w_size;
+ vw[0] = wsize;
vw = vec_splat(vw,0);
- vp = (vector unsigned short *) table_end;
+ vp = (vector unsigned short *)table;
do {
/* Processing 8 elements at a time */
vp--;
vm = *vp;
- /* This is equivalent to: m >= w_size ? m - w_size : 0
+ /* This is equivalent to: m >= wsize ? m - wsize : 0
* Since we are using a saturated unsigned subtraction, any
- * values that are > w_size will be set to 0, while the others
- * will be subtracted by w_size. */
+ * values that are < wsize will be set to 0, while the others
+ * will be reduced by wsize. */
*vp = vec_subs(vm,vw);
} while (--chunks);
}
void Z_INTERNAL slide_hash_power8(deflate_state *s) {
- unsigned int n;
- Pos *p;
-
- n = HASH_SIZE;
- p = &s->head[n];
- slide_hash_power8_loop(s,n,p);
+ uint16_t wsize = (uint16_t)s->w_size;
- n = s->w_size;
- p = &s->prev[n];
- slide_hash_power8_loop(s,n,p);
+ slide_hash_power8_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_power8_chain(s->prev, wsize, wsize);
}
#endif /* POWER8_VSX_SLIDEHASH */
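The NEON and VSX paths rely on the same saturated-subtract identity, so a scalar model may help. A minimal, self-contained sketch (the helper name rebase_pos is made up; this is not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of vec_subs / vqsubq_u16 applied to one hash entry. */
    static uint16_t rebase_pos(uint16_t m, uint16_t wsize) {
        return (uint16_t)(m >= wsize ? m - wsize : 0);
    }

    int main(void) {
        /* Positions that fell out of the window saturate to 0 (chain end)... */
        assert(rebase_pos(100, 32768) == 0);
        /* ...while positions still in range are shifted down by wsize. */
        assert(rebase_pos(40000, 32768) == 40000 - 32768);
        return 0;
    }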
#include <immintrin.h>
-Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+static inline void slide_hash_avx2_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+ table += entries;
+ table -= 16;
- n = HASH_SIZE;
- p = &s->head[n] - 16;
do {
__m256i value, result;
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ value = _mm256_loadu_si256((__m256i *)table);
+ result = _mm256_subs_epu16(value, wsize);
+ _mm256_storeu_si256((__m256i *)table, result);
- n = wsize;
- p = &s->prev[n] - 16;
- do {
- __m256i value, result;
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+}
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ slide_hash_avx2_chain(s->head, HASH_SIZE, ymm_wsize);
+ slide_hash_avx2_chain(s->prev, wsize, ymm_wsize);
}
#include <immintrin.h>
-Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+static inline void slide_hash_sse2_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+ table += entries;
+ table -= 8;
- n = HASH_SIZE;
- p = &s->head[n] - 8;
do {
__m128i value, result;
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
- p -= 8;
- n -= 8;
- } while (n > 0);
+ value = _mm_loadu_si128((__m128i *)table);
+ result = _mm_subs_epu16(value, wsize);
+ _mm_storeu_si128((__m128i *)table, result);
- n = wsize;
- p = &s->prev[n] - 8;
- do {
- __m128i value, result;
+ table -= 8;
+ entries -= 8;
+ } while (entries > 0);
+}
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
- p -= 8;
- n -= 8;
- } while (n > 0);
+ slide_hash_sse2_chain(s->head, HASH_SIZE, xmm_wsize);
+ slide_hash_sse2_chain(s->prev, wsize, xmm_wsize);
}
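One contract is worth calling out for both x86 helpers: each do/while loop steps a full vector of positions per iteration (16 for AVX2, 8 for SSE2) and decrements an unsigned counter, so entries must be a nonzero multiple of the step or the subtraction wraps and the loop never exits. Both call sites satisfy this, since HASH_SIZE and s->w_size are powers of two no smaller than the vector width. A guard one could place at the top of each chain helper (a sketch, not in the patch; shown for the AVX2 case):

    /* entries must be a nonzero multiple of the SIMD step (16 for AVX2),
     * otherwise entries -= 16 wraps around and the loop never terminates. */
    Assert(entries >= 16 && entries % 16 == 0, "entries must be a multiple of 16");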
* bit values at the expense of memory usage). We slide even when level == 0 to
* keep the hash table consistent if we switch back to level > 0 later.
*/
-Z_INTERNAL void slide_hash_c(deflate_state *s) {
- Pos *p;
- unsigned n;
- unsigned int wsize = s->w_size;
-
- n = HASH_SIZE;
- p = &s->head[n];
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
#ifdef NOT_TWEAK_COMPILER
+ table += entries;
do {
unsigned m;
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : 0);
- } while (--n);
+ m = *--table;
+ *table = (Pos)(m >= wsize ? m-wsize : 0);
+ /* If a position is not on any hash chain, its prev[] slot holds
+ * garbage, but that value will never be used.
+ */
+ } while (--entries);
#else
+ {
/* As of the time of this change, gcc (4.8.*) isn't able to vectorize
* this hot loop using saturated subtraction on the x86-64 architecture.
* To avoid this defect, we change the loop such that
*    o. the pointer advances forward, and
*    o. the variable 'm' is local to the loop, and is declared with
*       type "Pos" (instead of 'unsigned int') to avoid unnecessary
*       zero-extension.
*/
- {
unsigned int i;
- Pos *q = p - n;
- for (i = 0; i < n; i++) {
+ Pos *q = table;
+ for (i = 0; i < entries; i++) {
Pos m = *q;
Pos t = (Pos)wsize;
*q++ = (Pos)(m >= t ? m-t: 0);
}
}
#endif /* NOT_TWEAK_COMPILER */
+}
- n = wsize;
- p = &s->prev[n];
-#ifdef NOT_TWEAK_COMPILER
- do {
- unsigned m;
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : 0);
- /* If n is not on any hash chain, prev[n] is garbage but
- * its value will never be used.
- */
- } while (--n);
-#else
- {
- unsigned int i;
- Pos *q = p - n;
- for (i = 0; i < n; i++) {
- Pos m = *q;
- Pos t = (Pos)wsize;
- *q++ = (Pos)(m >= t ? m-t: 0);
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_c_chain(s->prev, wsize, wsize);
}
- }
-#endif /* NOT_TWEAK_COMPILER */
-}
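To make the rebase semantics concrete, here is a tiny standalone harness (hypothetical, not part of the patch) that applies the same transform as slide_hash_c_chain to a toy table:

    #include <stdio.h>

    typedef unsigned short Pos;

    /* Same per-entry transform as slide_hash_c_chain's vectorizer-friendly branch. */
    static void rebase(Pos *table, unsigned entries, Pos wsize) {
        for (unsigned i = 0; i < entries; i++)
            table[i] = (Pos)(table[i] >= wsize ? table[i] - wsize : 0);
    }

    int main(void) {
        Pos t[4] = {0, 3, 4, 100};
        rebase(t, 4, 4);
        printf("%u %u %u %u\n", t[0], t[1], t[2], t[3]); /* prints: 0 0 0 96 */
        return 0;
    }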
/* ========================================================================= */
int32_t Z_EXPORT PREFIX(deflateInit_)(PREFIX3(stream) *strm, int32_t level, const char *version, int32_t stream_size) {