From: Hans Kristian Rosbach <hk-git@circlestorm.org>
Date: Tue, 19 Aug 2025 20:35:53 +0000 (+0200)
Subject: DRAFT: Vectorize insert_string.
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fheads%2Finsert_string_avx512;p=thirdparty%2Fzlib-ng.git

DRAFT: Vectorize insert_string.

Multiple variants
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 38885089..8afcd43d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1106,6 +1106,8 @@ if(WITH_OPTIM)
             check_avx2_intrinsics()
             if(HAVE_AVX2_INTRIN AND WITH_SSE42)
                 add_definitions(-DX86_AVX2)
+                #list(APPEND AVX512_SRCS insert_string_avx2.c)
+                #list(APPEND AVX512_SRCS insert_string_avx2-8.c)
                 set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
                 add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
                 list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
@@ -1124,6 +1126,7 @@ if(WITH_OPTIM)
             check_avx512_intrinsics()
             if(HAVE_AVX512_INTRIN AND WITH_AVX2)
                 add_definitions(-DX86_AVX512)
+                #list(APPEND AVX512_SRCS insert_string_avx512.c)
                 list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
                 add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
                 list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
diff --git a/insert_string.c b/insert_string.c
index 11a5b97f..c21a5dea 100644
--- a/insert_string.c
+++ b/insert_string.c
@@ -15,7 +15,62 @@
 #define HASH_CALC_VAR_INIT   uint32_t h = 0
 
 #define UPDATE_HASH          update_hash
-#define INSERT_STRING        insert_string
+#define INSERT_STRING        insert_string2
 #define QUICK_INSERT_STRING  quick_insert_string
 
 #include "insert_string_tpl.h"
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+    uint8_t *strstart = s->window + str;                // Start of string
+    uint8_t *strend = strstart + count;                 // End of string
+    Pos *headp = s->head;                               // Local variabes to avoid indirection
+    Pos *prevp = s->prev;                               //  -||-
+    uint32_t w_mask = s->w_mask;                        //  -||-
+    Pos idx = (Pos)str;                                 // Starting index
+
+    for ( ; strstart + 3 < strend; idx+=3, strstart+=3) {
+        uint32_t val_0 = zng_memread_4(strstart);
+        uint32_t val_1 = zng_memread_4(strstart + 1);
+        uint32_t val_2 = zng_memread_4(strstart + 2);
+
+        uint32_t h_0 = ((val_0 * 2654435761U) >> 16) & HASH_MASK;
+        uint32_t h_1 = ((val_1 * 2654435761U) >> 16) & HASH_MASK;
+        uint32_t h_2 = ((val_2 * 2654435761U) >> 16) & HASH_MASK;
+
+        Pos idx_1 = idx + 1;
+        Pos idx_2 = idx + 2;
+
+        Pos head_0 = headp[h_0];
+        if (head_0 != idx) {
+            prevp[idx & w_mask] = head_0;
+            headp[h_0] = idx;
+        }
+
+        Pos head_1 = headp[h_1];
+        if (head_1 != idx_1) {
+            prevp[idx_1 & w_mask] = head_1;
+            headp[h_1] = idx_1;
+        }
+
+        Pos head_2 = headp[h_2];
+        if (head_2 != idx_2) {
+            prevp[idx_2 & w_mask] = head_2;
+            headp[h_2] = idx_2;
+        }
+    }
+
+    // Handle remaining elements as scalar
+    for ( ; strstart < strend; idx++, strstart++) {
+        uint32_t val, h;
+
+        val = zng_memread_4(strstart);
+        h = ((val * 2654435761U) >> 16);
+        h &= HASH_MASK;
+
+        Pos head = headp[h];
+        if (head != idx) {
+            prevp[idx & w_mask] = head;
+            headp[h] = idx;
+        }
+    }
+}
diff --git a/insert_string_avx2-8.c b/insert_string_avx2-8.c
new file mode 100644
index 00000000..6b5f067c
--- /dev/null
+++ b/insert_string_avx2-8.c
@@ -0,0 +1,66 @@
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+#include <stdio.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+    uint8_t *strstart = s->window + str;                // Start of string
+    uint8_t *strend = strstart + count;                 // End of string
+    Pos *headp = s->head;                               // Local variabes to avoid indirection
+    Pos *prevp = s->prev;                               //  -||-
+    uint32_t w_mask = s->w_mask;                        //  -||-
+    Pos idx = (Pos)str;                                 // Starting index
+
+    // Use vectorized loop if enough input length
+    if (count >= 8) {
+        __m256i hash_mask_vec = _mm256_set1_epi32(HASH_MASK);  // mask as vector
+        __m256i indices = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);        // gather indexes
+        const __m256i permVec = _mm256_setr_epi8(0, 1, 2, 3,   // load order
+                                              1, 2, 3, 4,
+                                              2, 3, 4, 5,
+                                              3, 4, 5, 6,
+                                              4, 5, 6, 7,
+                                              5, 6, 7, 8,
+                                              6, 7, 8, 9,
+                                              7, 8, 9, 10);
+
+        // Main vectorized loop
+        for ( ; strstart+8 < strend; idx+=8, strstart+=8) {
+            // Load data
+             __m256i val_vec = _mm256_i32gather_epi32((const int*)strstart, indices, 1);
+
+            // Hash calculation
+            __m256i h_vec = _mm256_mullo_epi32(val_vec, _mm256_set1_epi32(2654435761U));
+            h_vec = _mm256_srli_epi32(h_vec, 16);
+            h_vec = _mm256_and_si256(h_vec, hash_mask_vec);
+
+            int32_t h_array[8];
+            _mm256_storeu_si256((__m256i*)h_array, h_vec);
+
+            for (int i = 0; i < 8; i++) {
+                uint32_t h = h_array[i];
+                Pos idx0 = idx + i;
+                Pos head = headp[h];
+                if (head != idx0) {
+                    prevp[idx0 & w_mask] = head;
+                    headp[h] = idx0;
+                }
+            }
+        }
+    }
+
+    // Handle remaining elements as scalar
+    for ( ; strstart < strend; idx++, strstart++) {
+        uint32_t val, h;
+
+        val = zng_memread_4(strstart);
+        h = ((val * 2654435761U) >> 16);
+        h &= HASH_MASK;
+
+        Pos head = headp[h];
+        if (head != idx) {
+            prevp[idx & w_mask] = head;
+            headp[h] = idx;
+        }
+    }
+}
diff --git a/insert_string_avx2.c b/insert_string_avx2.c
new file mode 100644
index 00000000..46fe2a0b
--- /dev/null
+++ b/insert_string_avx2.c
@@ -0,0 +1,84 @@
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+#include <stdio.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+    uint8_t *strstart = s->window + str;                // Start of string
+    uint8_t *strend = strstart + count;                 // End of string
+    Pos *headp = s->head;                               // Local variabes to avoid indirection
+    Pos *prevp = s->prev;                               //  -||-
+    uint32_t w_mask = s->w_mask;                        //  -||-
+    Pos idx = (Pos)str;                                 // Starting index
+
+    printf("%d\n", count);
+
+    // Use vectorized loop if enough input length
+    if (count >= 4) {
+        __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK);  // mask as vector
+        const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3,   // load order
+                                              1, 2, 3, 4,
+                                              2, 3, 4, 5,
+                                              3, 4, 5, 6);
+
+        // Main vectorized loop
+        for ( ; strstart+4 < strend; idx+=4, strstart+=4) {
+            // Load data
+            __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart);
+            val_vec = _mm_shuffle_epi8(val_vec, permVec);
+
+            // Prepare idx
+            Pos idx1 = idx + 1;
+            Pos idx2 = idx + 2;
+            Pos idx3 = idx + 3;
+
+            // Hash calculation
+            __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U));
+            h_vec = _mm_srli_epi32(h_vec, 16);
+            h_vec = _mm_and_si128(h_vec, hash_mask_vec);
+
+            // Extract the hashed values
+            uint32_t h0 = _mm_extract_epi32(h_vec, 0);
+            uint32_t h1 = _mm_extract_epi32(h_vec, 1);
+            uint32_t h2 = _mm_extract_epi32(h_vec, 2);
+            uint32_t h3 = _mm_extract_epi32(h_vec, 3);
+
+            // Insert into hash table
+            Pos head0 = headp[h0];
+            if (head0 != idx) {
+                prevp[idx & w_mask] = head0;
+                headp[h0] = idx;
+            }
+            Pos head1 = headp[h1];
+            if (head1 != idx1) {
+                prevp[idx1 & w_mask] = head1;
+                headp[h1] = idx1;
+            }
+            Pos head2 = headp[h2];
+            if (head2 != idx2) {
+                prevp[idx2 & w_mask] = head2;
+                headp[h2] = idx2;
+            }
+            Pos head3 = headp[h3];
+            if (head3 != idx3) {
+                prevp[idx3 & w_mask] = head3;
+                headp[h3] = idx3;
+            }
+        }
+    }
+
+    // Handle remaining elements as scalar
+    for ( ; strstart < strend; idx++, strstart++) {
+        uint32_t val, h;
+
+        val = zng_memread_4(strstart);
+        h = ((val * 2654435761U) >> 16);
+        h &= HASH_MASK;
+
+        Pos head = headp[h];
+        if (head != idx) {
+            prevp[idx & w_mask] = head;
+            headp[h] = idx;
+        }
+    }
+}
diff --git a/insert_string_avx512.c b/insert_string_avx512.c
new file mode 100644
index 00000000..4a578bb8
--- /dev/null
+++ b/insert_string_avx512.c
@@ -0,0 +1,69 @@
+#include "zbuild.h"
+#include "deflate.h"
+#include <immintrin.h>
+
+void insert_string(deflate_state *const s, uint32_t str, uint32_t count) {
+    uint8_t *strstart = s->window + str;                // Start of string
+    uint8_t *strend = strstart + count;                 // End of string
+    Pos *headp = s->head;                               // Local variabes to avoid indirection
+    Pos *prevp = s->prev;                               //  -||-
+    uint32_t w_mask = s->w_mask;                        //  -||-
+    Pos idx = (Pos)str;                                 // Starting index
+
+    // Use vectorized loop if enough input length
+    if (count >= 4) {
+        __m128i hash_mask_vec = _mm_set1_epi32(HASH_MASK);  // mask as vector
+        __m128i w_mask_vec = _mm_set1_epi32(s->w_mask);     // w_mask as vector
+        __m128i indices = _mm_set_epi32(3, 2, 1, 0);        // gather indexes
+        const __m128i permVec = _mm_setr_epi8(0, 1, 2, 3,   // load order
+                                              1, 2, 3, 4,
+                                              2, 3, 4, 5,
+                                              3, 4, 5, 6);
+
+        // Main vectorized loop
+        for ( ; strstart+4 < strend; idx+=4, strstart+=4) {
+            // Load data
+            __m128i val_vec = _mm_loadl_epi64((__m128i *)strstart);
+            val_vec = _mm_shuffle_epi8(val_vec, permVec);
+
+            // Prepare idx
+            __m128i idx_vec = _mm_add_epi32(_mm_set1_epi32(idx), indices);
+
+            // prev_indices = idx_vec & w_mask
+            __m128i prev_indices = _mm_and_si128(idx_vec, w_mask_vec);
+
+            // Hash calculation
+            __m128i h_vec = _mm_mullo_epi32(val_vec, _mm_set1_epi32(2654435761U));
+            h_vec = _mm_srli_epi32(h_vec, 16);
+            h_vec = _mm_and_si128(h_vec, hash_mask_vec);
+
+            // head_vec = headp[h_vec]
+            __m128i head_vec = _mm_i32gather_epi32((const int*)headp, h_vec, sizeof(Pos));
+
+            // Compute mask where head != idx
+            __mmask8 mask = _mm_cmpeq_epi32_mask(head_vec, idx_vec);
+            mask = ~mask & 0xF;
+
+            // Scatter headp[h] = idx
+            _mm_mask_i32scatter_epi32((int*)headp, mask, h_vec, idx_vec, sizeof(Pos));
+
+            // Scatter prevp[idx & w_mask] = headp[h]
+            _mm_mask_i32scatter_epi32((int*)prevp, mask, prev_indices, head_vec, sizeof(Pos));
+        }
+    }
+
+    // Handle remaining elements as scalar
+    for ( ; strstart < strend; idx++, strstart++) {
+        uint32_t val, h;
+
+        val = zng_memread_4(strstart);
+        h = ((val * 2654435761U) >> 16);
+        h &= HASH_MASK;
+
+        Pos head = headp[h];
+        if (head != idx) {
+            prevp[idx & w_mask] = head;
+            headp[h] = idx;
+        }
+    }
+}