check_avx2_intrinsics()
if(HAVE_AVX2_INTRIN AND WITH_SSE42)
add_definitions(-DX86_AVX2)
+ #list(APPEND AVX512_SRCS insert_string_avx2.c)
+ #list(APPEND AVX512_SRCS insert_string_avx2-8.c)
set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
check_avx512_intrinsics()
if(HAVE_AVX512_INTRIN AND WITH_AVX2)
add_definitions(-DX86_AVX512)
+ #list(APPEND AVX512_SRCS insert_string_avx512.c)
list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH update_hash
-#define INSERT_STRING insert_string
+#define INSERT_STRING insert_string2
#define QUICK_INSERT_STRING quick_insert_string
#include "insert_string_tpl.h"
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+ uint8_t *strstart = s->window + str; // Start of string
+ uint8_t *strend = strstart + count; // End of string
+ Pos *headp = s->head; // Local variabes to avoid indirection
+ Pos *prevp = s->prev; // -||-
+ uint32_t w_mask = s->w_mask; // -||-
+ Pos idx = (Pos)str; // Starting index
+
+ for ( ; strstart + 3 < strend; idx+=3, strstart+=3) {
+ uint32_t val_0 = zng_memread_4(strstart);
+ uint32_t val_1 = zng_memread_4(strstart + 1);
+ uint32_t val_2 = zng_memread_4(strstart + 2);
+
+ uint32_t h_0 = ((val_0 * 2654435761U) >> 16) & HASH_MASK;
+ uint32_t h_1 = ((val_1 * 2654435761U) >> 16) & HASH_MASK;
+ uint32_t h_2 = ((val_2 * 2654435761U) >> 16) & HASH_MASK;
+
+ Pos idx_1 = idx + 1;
+ Pos idx_2 = idx + 2;
+
+ Pos head_0 = headp[h_0];
+ if (head_0 != idx) {
+ prevp[idx & w_mask] = head_0;
+ headp[h_0] = idx;
+ }
+
+ Pos head_1 = headp[h_1];
+ if (head_1 != idx_1) {
+ prevp[idx_1 & w_mask] = head_1;
+ headp[h_1] = idx_1;
+ }
+
+ Pos head_2 = headp[h_2];
+ if (head_2 != idx_2) {
+ prevp[idx_2 & w_mask] = head_2;
+ headp[h_2] = idx_2;
+ }
+ }
+
+ // Handle remaining elements as scalar
+ for ( ; strstart < strend; idx++, strstart++) {
+ uint32_t val, h;
+
+ val = zng_memread_4(strstart);
+ h = ((val * 2654435761U) >> 16);
+ h &= HASH_MASK;
+
+ Pos head = headp[h];
+ if (head != idx) {
+ prevp[idx & w_mask] = head;
+ headp[h] = idx;
+ }
+ }
+}
--- /dev/null
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+#include <stdio.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+ uint8_t *strstart = s->window + str; // Start of string
+ uint8_t *strend = strstart + count; // End of string
+ Pos *headp = s->head; // Local variabes to avoid indirection
+ Pos *prevp = s->prev; // -||-
+ uint32_t w_mask = s->w_mask; // -||-
+ Pos idx = (Pos)str; // Starting index
+
+ // Use vectorized loop if enough input length
+ if (count >= 8) {
+ __m256i hash_mask_vec = _mm256_set1_epi32(HASH_MASK); // mask as vector
+ __m256i indices = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); // gather indexes
+ const __m256i permVec = _mm256_setr_epi8(0, 1, 2, 3, // load order
+ 1, 2, 3, 4,
+ 2, 3, 4, 5,
+ 3, 4, 5, 6,
+ 4, 5, 6, 7,
+ 5, 6, 7, 8,
+ 6, 7, 8, 9,
+ 7, 8, 9, 10);
+
+ // Main vectorized loop
+ for ( ; strstart+8 < strend; idx+=8, strstart+=8) {
+ // Load data
+ __m256i val_vec = _mm256_i32gather_epi32((const int*)strstart, indices, 1);
+
+ // Hash calculation
+ __m256i h_vec = _mm256_mullo_epi32(val_vec, _mm256_set1_epi32(2654435761U));
+ h_vec = _mm256_srli_epi32(h_vec, 16);
+ h_vec = _mm256_and_si256(h_vec, hash_mask_vec);
+
+ int32_t h_array[8];
+ _mm256_storeu_si256((__m256i*)h_array, h_vec);
+
+ for (int i = 0; i < 8; i++) {
+ uint32_t h = h_array[i];
+ Pos idx0 = idx + i;
+ Pos head = headp[h];
+ if (head != idx0) {
+ prevp[idx0 & w_mask] = head;
+ headp[h] = idx0;
+ }
+ }
+ }
+ }
+
+ // Handle remaining elements as scalar
+ for ( ; strstart < strend; idx++, strstart++) {
+ uint32_t val, h;
+
+ val = zng_memread_4(strstart);
+ h = ((val * 2654435761U) >> 16);
+ h &= HASH_MASK;
+
+ Pos head = headp[h];
+ if (head != idx) {
+ prevp[idx & w_mask] = head;
+ headp[h] = idx;
+ }
+ }
+}
--- /dev/null
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+#include <stdio.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+ uint8_t *strstart = s->window + str; // Start of string
+ uint8_t *strend = strstart + count; // End of string
+ Pos *headp = s->head; // Local variabes to avoid indirection
+ Pos *prevp = s->prev; // -||-
+ uint32_t w_mask = s->w_mask; // -||-
+ Pos idx = (Pos)str; // Starting index
+
+ printf("%d\n", count);
+
+ // Use vectorized loop if enough input length
+ if (count >= 4) {
+ __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK); // mask as vector
+ const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3, // load order
+ 1, 2, 3, 4,
+ 2, 3, 4, 5,
+ 3, 4, 5, 6);
+
+ // Main vectorized loop
+ for ( ; strstart+4 < strend; idx+=4, strstart+=4) {
+ // Load data
+ __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart);
+ val_vec = _mm_shuffle_epi8(val_vec, permVec);
+
+ // Prepare idx
+ Pos idx1 = idx + 1;
+ Pos idx2 = idx + 2;
+ Pos idx3 = idx + 3;
+
+ // Hash calculation
+ __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U));
+ h_vec = _mm_srli_epi32(h_vec, 16);
+ h_vec = _mm_and_si128(h_vec, hash_mask_vec);
+
+ // Extract the hashed values
+ uint32_t h0 = _mm_extract_epi32(h_vec, 0);
+ uint32_t h1 = _mm_extract_epi32(h_vec, 1);
+ uint32_t h2 = _mm_extract_epi32(h_vec, 2);
+ uint32_t h3 = _mm_extract_epi32(h_vec, 3);
+
+ // Insert into hash table
+ Pos head0 = headp[h0];
+ if (head0 != idx) {
+ prevp[idx & w_mask] = head0;
+ headp[h0] = idx;
+ }
+ Pos head1 = headp[h1];
+ if (head1 != idx1) {
+ prevp[idx1 & w_mask] = head1;
+ headp[h1] = idx1;
+ }
+ Pos head2 = headp[h2];
+ if (head2 != idx2) {
+ prevp[idx2 & w_mask] = head2;
+ headp[h2] = idx2;
+ }
+ Pos head3 = headp[h3];
+ if (head3 != idx3) {
+ prevp[idx3 & w_mask] = head3;
+ headp[h3] = idx3;
+ }
+ }
+ }
+
+ // Handle remaining elements as scalar
+ for ( ; strstart < strend; idx++, strstart++) {
+ uint32_t val, h;
+
+ val = zng_memread_4(strstart);
+ h = ((val * 2654435761U) >> 16);
+ h &= HASH_MASK;
+
+ Pos head = headp[h];
+ if (head != idx) {
+ prevp[idx & w_mask] = head;
+ headp[h] = idx;
+ }
+ }
+}
--- /dev/null
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+ uint8_t *strstart = s->window + str; // Start of string
+ uint8_t *strend = strstart + count; // End of string
+ Pos *headp = s->head; // Local variabes to avoid indirection
+ Pos *prevp = s->prev; // -||-
+ uint32_t w_mask = s->w_mask; // -||-
+ Pos idx = (Pos)str; // Starting index
+
+ // Use vectorized loop if enough input length
+ if (count >= 4) {
+ __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK); // mask as vector
+ __m128i w_mask_vec = _mm_set1_epi32(s->w_mask); // w_mask as vector
+ __m128i indices = _mm_set_epi32(3, 2, 1, 0); // gather indexes
+ const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3, // load order
+ 1, 2, 3, 4,
+ 2, 3, 4, 5,
+ 3, 4, 5, 6);
+
+ // Main vectorized loop
+ for ( ; strstart+4 < strend; idx+=4, strstart+=4) {
+ // Load data
+ __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart);
+ val_vec = _mm_shuffle_epi8(val_vec, permVec);
+
+ // Prepare idx
+ __m128i idx_vec = _mm_add_epi32(_mm_set1_epi32(idx), indices);
+
+ // prev_indices = idx_vec & w_mask
+ __m128i prev_indices = _mm_and_si128(idx_vec, w_mask_vec);
+
+ // Hash calculation
+ __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U));
+ h_vec = _mm_srli_epi32(h_vec, 16);
+ h_vec = _mm_and_si128(h_vec, hash_mask_vec);
+
+ // head_vec = headp[h_vec]
+ __m128i head_vec = _mm_i32gather_epi32((const int*)headp, h_vec, sizeof(Pos));
+
+ // Compute mask where head != idx
+ __mmask8 mask = _mm_cmpeq_epi32_mask(head_vec, idx_vec);
+ mask = ~mask & 0xF;
+
+ // Scatter headp[h] = idx
+ _mm_mask_i32scatter_epi32((int*)headp, mask, h_vec, idx_vec, sizeof(Pos));
+
+ // Scatter prevp[idx & w_mask] = headp[h]
+ _mm_mask_i32scatter_epi32((int*)prevp, mask, prev_indices, head_vec, sizeof(Pos));
+ }
+ }
+
+ // Handle remaining elements as scalar
+ for ( ; strstart < strend; idx++, strstart++) {
+ uint32_t val, h;
+
+ val = zng_memread_4(strstart);
+ h = ((val * 2654435761U) >> 16);
+ h &= HASH_MASK;
+
+ Pos head = headp[h];
+ if (head != idx) {
+ prevp[idx & w_mask] = head;
+ headp[h] = idx;
+ }
+ }
+}