From: Hans Kristian Rosbach Date: Tue, 19 Aug 2025 20:35:53 +0000 (+0200) Subject: DRAFT: Vectorize insert_string. X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fheads%2Finsert_string_avx512;p=thirdparty%2Fzlib-ng.git DRAFT: Vectorize insert_string. Multiple variants --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 38885089..8afcd43d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1106,6 +1106,8 @@ if(WITH_OPTIM) check_avx2_intrinsics() if(HAVE_AVX2_INTRIN AND WITH_SSE42) add_definitions(-DX86_AVX2) + #list(APPEND AVX512_SRCS insert_string_avx2.c) + #list(APPEND AVX512_SRCS insert_string_avx2-8.c) set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) @@ -1124,6 +1126,7 @@ if(WITH_OPTIM) check_avx512_intrinsics() if(HAVE_AVX512_INTRIN AND WITH_AVX2) add_definitions(-DX86_AVX512) + #list(APPEND AVX512_SRCS insert_string_avx512.c) list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c) diff --git a/insert_string.c b/insert_string.c index 11a5b97f..c21a5dea 100644 --- a/insert_string.c +++ b/insert_string.c @@ -15,7 +15,62 @@ #define HASH_CALC_VAR_INIT uint32_t h = 0 #define UPDATE_HASH update_hash -#define INSERT_STRING insert_string +#define INSERT_STRING insert_string2 #define QUICK_INSERT_STRING quick_insert_string #include "insert_string_tpl.h" + +void insert_string(deflate_state *const s, uint32_t str, uint32_t count) { + uint8_t *strstart = s->window + str; // Start of string + uint8_t *strend = strstart + count; // End of string + Pos *headp = s->head; // Local variabes to avoid indirection + Pos *prevp = s->prev; // -||- + uint32_t w_mask = s->w_mask; // -||- + Pos idx = (Pos)str; // Starting index + + for ( ; strstart + 3 < strend; idx+=3, strstart+=3) { + uint32_t val_0 = zng_memread_4(strstart); + uint32_t val_1 = zng_memread_4(strstart + 1); + uint32_t val_2 = zng_memread_4(strstart + 2); + + uint32_t h_0 = ((val_0 * 2654435761U) >> 16) & HASH_MASK; + uint32_t h_1 = ((val_1 * 2654435761U) >> 16) & HASH_MASK; + uint32_t h_2 = ((val_2 * 2654435761U) >> 16) & HASH_MASK; + + Pos idx_1 = idx + 1; + Pos idx_2 = idx + 2; + + Pos head_0 = headp[h_0]; + if (head_0 != idx) { + prevp[idx & w_mask] = head_0; + headp[h_0] = idx; + } + + Pos head_1 = headp[h_1]; + if (head_1 != idx_1) { + prevp[idx_1 & w_mask] = head_1; + headp[h_1] = idx_1; + } + + Pos head_2 = headp[h_2]; + if (head_2 != idx_2) { + prevp[idx_2 & w_mask] = head_2; + headp[h_2] = idx_2; + } + } + + // Handle remaining elements as scalar + for ( ; strstart < strend; idx++, strstart++) { + uint32_t val, h; + + val = zng_memread_4(strstart); + h = ((val * 2654435761U) >> 16); + h &= HASH_MASK; + + Pos head = headp[h]; + if (head != idx) { + prevp[idx & w_mask] = head; + headp[h] = idx; + } + } +} diff --git a/insert_string_avx2-8.c b/insert_string_avx2-8.c new file mode 100644 index 00000000..6b5f067c --- /dev/null +++ b/insert_string_avx2-8.c @@ -0,0 +1,66 @@ +#include "zbuild.h" +#include "deflate.h" +#include +#include + +void insert_string(deflate_state *const s, uint32_t str, uint32_t count) { + uint8_t *strstart = s->window + str; // Start of string + uint8_t *strend = strstart + count; // End of string + Pos *headp = s->head; // Local variabes to avoid indirection + Pos *prevp = s->prev; // -||- + uint32_t w_mask = s->w_mask; // -||- + Pos idx = (Pos)str; // Starting index + + // Use vectorized loop if enough input length + if (count >= 8) { + __m256i hash_mask_vec = _mm256_set1_epi32(HASH_MASK); // mask as vector + __m256i indices = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); // gather indexes + const __m256i permVec = _mm256_setr_epi8(0, 1, 2, 3, // load order + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6, + 4, 5, 6, 7, + 5, 6, 7, 8, + 6, 7, 8, 9, + 7, 8, 9, 10); + + // Main vectorized loop + for ( ; strstart+8 < strend; idx+=8, strstart+=8) { + // Load data + __m256i val_vec = _mm256_i32gather_epi32((const int*)strstart, indices, 1); + + // Hash calculation + __m256i h_vec = _mm256_mullo_epi32(val_vec, _mm256_set1_epi32(2654435761U)); + h_vec = _mm256_srli_epi32(h_vec, 16); + h_vec = _mm256_and_si256(h_vec, hash_mask_vec); + + int32_t h_array[8]; + _mm256_storeu_si256((__m256i*)h_array, h_vec); + + for (int i = 0; i < 8; i++) { + uint32_t h = h_array[i]; + Pos idx0 = idx + i; + Pos head = headp[h]; + if (head != idx0) { + prevp[idx0 & w_mask] = head; + headp[h] = idx0; + } + } + } + } + + // Handle remaining elements as scalar + for ( ; strstart < strend; idx++, strstart++) { + uint32_t val, h; + + val = zng_memread_4(strstart); + h = ((val * 2654435761U) >> 16); + h &= HASH_MASK; + + Pos head = headp[h]; + if (head != idx) { + prevp[idx & w_mask] = head; + headp[h] = idx; + } + } +} diff --git a/insert_string_avx2.c b/insert_string_avx2.c new file mode 100644 index 00000000..46fe2a0b --- /dev/null +++ b/insert_string_avx2.c @@ -0,0 +1,84 @@ +#include "zbuild.h" +#include "deflate.h" +#include +#include + +void insert_string(deflate_state *const s, uint32_t str, uint32_t count) { + uint8_t *strstart = s->window + str; // Start of string + uint8_t *strend = strstart + count; // End of string + Pos *headp = s->head; // Local variabes to avoid indirection + Pos *prevp = s->prev; // -||- + uint32_t w_mask = s->w_mask; // -||- + Pos idx = (Pos)str; // Starting index + + printf("%d\n", count); + + // Use vectorized loop if enough input length + if (count >= 4) { + __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK); // mask as vector + const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3, // load order + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6); + + // Main vectorized loop + for ( ; strstart+4 < strend; idx+=4, strstart+=4) { + // Load data + __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart); + val_vec = _mm_shuffle_epi8(val_vec, permVec); + + // Prepare idx + Pos idx1 = idx + 1; + Pos idx2 = idx + 2; + Pos idx3 = idx + 3; + + // Hash calculation + __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U)); + h_vec = _mm_srli_epi32(h_vec, 16); + h_vec = _mm_and_si128(h_vec, hash_mask_vec); + + // Extract the hashed values + uint32_t h0 = _mm_extract_epi32(h_vec, 0); + uint32_t h1 = _mm_extract_epi32(h_vec, 1); + uint32_t h2 = _mm_extract_epi32(h_vec, 2); + uint32_t h3 = _mm_extract_epi32(h_vec, 3); + + // Insert into hash table + Pos head0 = headp[h0]; + if (head0 != idx) { + prevp[idx & w_mask] = head0; + headp[h0] = idx; + } + Pos head1 = headp[h1]; + if (head1 != idx1) { + prevp[idx1 & w_mask] = head1; + headp[h1] = idx1; + } + Pos head2 = headp[h2]; + if (head2 != idx2) { + prevp[idx2 & w_mask] = head2; + headp[h2] = idx2; + } + Pos head3 = headp[h3]; + if (head3 != idx3) { + prevp[idx3 & w_mask] = head3; + headp[h3] = idx3; + } + } + } + + // Handle remaining elements as scalar + for ( ; strstart < strend; idx++, strstart++) { + uint32_t val, h; + + val = zng_memread_4(strstart); + h = ((val * 2654435761U) >> 16); + h &= HASH_MASK; + + Pos head = headp[h]; + if (head != idx) { + prevp[idx & w_mask] = head; + headp[h] = idx; + } + } +} diff --git a/insert_string_avx512.c b/insert_string_avx512.c new file mode 100644 index 00000000..4a578bb8 --- /dev/null +++ b/insert_string_avx512.c @@ -0,0 +1,69 @@ +#include "zbuild.h" +#include "deflate.h" +#include + +void insert_string(deflate_state *const s, uint32_t str, uint32_t count) { + uint8_t *strstart = s->window + str; // Start of string + uint8_t *strend = strstart + count; // End of string + Pos *headp = s->head; // Local variabes to avoid indirection + Pos *prevp = s->prev; // -||- + uint32_t w_mask = s->w_mask; // -||- + Pos idx = (Pos)str; // Starting index + + // Use vectorized loop if enough input length + if (count >= 4) { + __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK); // mask as vector + __m128i w_mask_vec = _mm_set1_epi32(s->w_mask); // w_mask as vector + __m128i indices = _mm_set_epi32(3, 2, 1, 0); // gather indexes + const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3, // load order + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6); + + // Main vectorized loop + for ( ; strstart+4 < strend; idx+=4, strstart+=4) { + // Load data + __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart); + val_vec = _mm_shuffle_epi8(val_vec, permVec); + + // Prepare idx + __m128i idx_vec = _mm_add_epi32(_mm_set1_epi32(idx), indices); + + // prev_indices = idx_vec & w_mask + __m128i prev_indices = _mm_and_si128(idx_vec, w_mask_vec); + + // Hash calculation + __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U)); + h_vec = _mm_srli_epi32(h_vec, 16); + h_vec = _mm_and_si128(h_vec, hash_mask_vec); + + // head_vec = headp[h_vec] + __m128i head_vec = _mm_i32gather_epi32((const int*)headp, h_vec, sizeof(Pos)); + + // Compute mask where head != idx + __mmask8 mask = _mm_cmpeq_epi32_mask(head_vec, idx_vec); + mask = ~mask & 0xF; + + // Scatter headp[h] = idx + _mm_mask_i32scatter_epi32((int*)headp, mask, h_vec, idx_vec, sizeof(Pos)); + + // Scatter prevp[idx & w_mask] = headp[h] + _mm_mask_i32scatter_epi32((int*)prevp, mask, prev_indices, head_vec, sizeof(Pos)); + } + } + + // Handle remaining elements as scalar + for ( ; strstart < strend; idx++, strstart++) { + uint32_t val, h; + + val = zng_memread_4(strstart); + h = ((val * 2654435761U) >> 16); + h &= HASH_MASK; + + Pos head = headp[h]; + if (head != idx) { + prevp[idx & w_mask] = head; + headp[h] = idx; + } + } +}