Adding an SSE42 optimized copy + adler checksum implementation

author Adam Stylinski <kungfujesus06@gmail.com>

Sun, 3 Apr 2022 16:18:12 +0000 (12:18 -0400)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
author Adam Stylinski <kungfujesus06@gmail.com>
Sun, 3 Apr 2022 16:18:12 +0000 (12:18 -0400)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 247af86d22d352750fadc40646820affdd9f049f..9719468bf325cebee691051bf65ce20e7669a3c7 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -779,8 +779,8 @@ if(WITH_OPTIM)
          if(WITH_SSE42)
              check_sse42_intrinsics()
              if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
-                add_definitions(-DX86_SSE42_CRC_HASH)
-                set(SSE42_SRCS ${ARCHDIR}/insert_string_sse42.c)
+                add_definitions(-DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32)
+                set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
                  add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
                  list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
                  set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
diff --git a/adler32_fold.h b/adler32_fold.h

index d93a5105345de827fb2458c2d7196e00fcd0c7fe..ec4270a7ab782077e9f8fe901cfb861f2e3381b5 100644 (file)
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -11,7 +11,9 @@
  typedef struct adler32_fold_s {
      uint8_t adler[64]; // First half of component sums
      uint8_t sum2[64]; // Second half of component sums
-    uint32_t nsums; // The number of scalar sums performed
+    uint8_t leftover[16]; // A buffer for sub 16 sized carry over, sized for full loads and alignment
+    uint32_t nsums; // The number of scalar sums leftover
+    uint32_t bytes_leftover; // The number of leftover bytes from the previous sum
  } adler32_fold;
  
  Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler);
diff --git a/adler32_p.h b/adler32_p.h

index 1adc1ccb0aebc9181a242ac1bff3976c9e4141da..5a14172f73294f1043c79c78b9e280704d06e5cd 100644 (file)
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,6 +26,18 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u
      return adler | (sum2 << 16);
  }
  
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++; 
+        adler += *dst++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
  static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
      while (len) {
          --len;
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in

index 689e3a0c24c7e59c34fe730d6e972650407309fd..f9aedf82ba679c90f2c0dc395539b300cbf2214f 100644 (file)
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -28,6 +28,7 @@ all: \
         adler32_avx2.o adler32_avx2.lo \
         adler32_avx512.o adler32_avx512.lo \
         adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+       adler32_sse42.o adler32_sse42.lo \
         adler32_ssse3.o adler32_ssse3.lo \
         chunkset_avx.o chunkset_avx.lo \
         chunkset_sse2.o chunkset_sse2.lo \
@@ -110,25 +111,31 @@ adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
         $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
  
  adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
-       $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+       $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
  
  adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
         $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
  
  adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
-       $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+       $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
  
  adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
         $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
  
  adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
-       $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+       $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
  
  adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
         $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
  
  adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
-       $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+       $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+       $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+       $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
  
  mostlyclean: clean
  clean:
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c

new file mode 100644 (file)

index 0000000..dce1550
--- /dev/null
+++ b/arch/x86/adler32_sse42.c
@@ -0,0 +1,31 @@
+/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42_ADLER32
+
+Z_INTERNAL void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler) {
+    adler->nsums = init_adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_final_sse42(adler32_fold *adler) {
+    return adler->nsums;
+}
+
+#include "adler32_sse42_tpl.h"
+#undef ADLER32_SSE42_TPL_H_
+#define COPY
+#include "adler32_sse42_tpl.h"
+#undef COPY
+
+#endif
diff --git a/arch/x86/adler32_sse42_tpl.h b/arch/x86/adler32_sse42_tpl.h

new file mode 100644 (file)

index 0000000..71d1db8
--- /dev/null
+++ b/arch/x86/adler32_sse42_tpl.h
@@ -0,0 +1,132 @@
+/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSE42_TPL_H_
+#define ADLER32_SSE42_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+
+    uint32_t adler0, adler1;
+    adler1 = (adler->nsums >> 16) & 0xffff;
+    adler0 = adler->nsums & 0xffff; 
+
+    if (len < 16) {
+rem_peel:
+#ifdef COPY
+       adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+       adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+       return;
+    }
+
+    __m128i vbuf, vbuf_0;
+    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            v_sad_sum2, vsum2, vsum2_0;
+    __m128i zero = _mm_setzero_si128();
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    size_t k;
+
+    while (len >= 16) {
+
+        k = MIN(len, NMAX);
+        k -= k % 16;
+        len -= k;
+
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+#endif
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+    }
+
+    /* If this is true, there's fewer than 16 elements remaining */
+    if (len) {
+        goto rem_peel;
+    }
+
+    adler->nsums = adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c

index b2ef0115c88064cc3e8b297d01faac5122c94f3d..8c55badf985517b860ee839ec43b889f311321cd 100644 (file)
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -8,25 +8,12 @@
  
  #include "../../zbuild.h"
  #include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
  
  #ifdef X86_SSSE3_ADLER32
  
  #include <immintrin.h>
  
-static inline uint32_t partial_hsum(__m128i x) {
-    __m128i second_int = _mm_bsrli_si128(x, 8);
-    __m128i sum = _mm_add_epi32(x, second_int);
-    return _mm_cvtsi128_si32(sum);
-}
-
-static inline uint32_t hsum(__m128i x) {
-    __m128i sum1 = _mm_unpackhi_epi64(x, x);
-    __m128i sum2 = _mm_add_epi32(x, sum1);
-    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
-    __m128i sum4 = _mm_add_epi32(sum2, sum3);
-    return _mm_cvtsi128_si32(sum4);
-}
-
  Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
      uint32_t sum2;
  
diff --git a/arch/x86/adler32_ssse3_p.h b/arch/x86/adler32_ssse3_p.h

new file mode 100644 (file)

index 0000000..ba914e1
--- /dev/null
+++ b/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+    __m128i second_int = _mm_bsrli_si128(x, 8);
+    __m128i sum = _mm_add_epi32(x, second_int);
+    return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+    __m128i sum1 = _mm_unpackhi_epi64(x, x);
+    __m128i sum2 = _mm_add_epi32(x, sum1);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+    return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h

new file mode 100644 (file)

index 0000000..aedfa81
--- /dev/null
+++ b/arch/x86/adler32_ssse3_tpl.h
@@ -0,0 +1,188 @@
+/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_TPL_H_
+#define ADLER32_SSSE3_TPL_H_
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef COPY
+Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
+#endif
+    uint32_t adler0, adler1;
+
+     /* split Adler-32 into component sums */
+    adler1 = (adler->nsums >> 16) & 0xffff;
+    adler0 = adler->nsums & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1)) {
+#ifdef COPY
+        *(dst++) = *src;
+#endif
+        adler->nsums = adler32_len_1(adler0, src, adler1);
+        return;
+    }
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(src == NULL)) {
+        adler->nsums = 1L;
+        return;
+    }
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16)) {
+        goto sub16;
+    }
+
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)src & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            len -= 16;
+            src += 16;
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+            vs1 = _mm_cvtsi32_si128(adler0);
+            vs2 = _mm_cvtsi32_si128(adler1);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+#ifdef COPY
+        memcpy(dst, src, align_offset);
+        dst += align_offset;
+#endif
+        for (size_t i = 0; i < align_offset; ++i) {
+            adler0 += *(src++);
+            adler1 += adler0;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset; 
+    }
+
+
+    while (len >= 16) {
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)src);
+            vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+#endif
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+#ifdef COPY
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+#endif
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+        max_iters = NMAX;
+    }
+
+sub16:
+#ifdef COPY
+    adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+    /* Process tail (len < 16).  */
+    adler->nsums = adler32_len_16(adler0, src, len, adler1);
+#endif
+}
+
+#endif
diff --git a/configure b/configure

index 6f79825a7f9fedaa99d6b5005b58856385946c96..b90c2342fc7ffccd4f659b00864cc816c37b1deb 100755 (executable)
--- a/configure
+++ b/configure
@@ -1495,16 +1495,16 @@ case "${ARCH}" in
              check_sse42_intrinsics
  
              if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
-                SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
+                CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"
+                SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH -DX86_SSE42_ADLER32"
  
                  if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
                    CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
                    SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
                  fi
  
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse42.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse42.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse42.o insert_string_sse42.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse42.lo insert_string_sse42.lo"
              fi
  
              check_sse2_intrinsics
diff --git a/cpu_features.h b/cpu_features.h

index d3df33b9f6c52225ffc41c7ee2ca999d8a12973d..3dfaf6ce2e36fa3fceecc39df7aa2812473e7d0a 100644 (file)
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -7,6 +7,7 @@
  #define CPU_FEATURES_H_
  
  #include "crc32_fold.h"
+#include "adler32_fold.h"
  
  #if defined(X86_FEATURES)
  #  include "arch/x86/x86_features.h"
@@ -34,6 +35,12 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len
  #ifdef X86_SSSE3_ADLER32
  extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
  #endif
+#ifdef X86_SSE42_ADLER32
+extern void     adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler);
+extern void     adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern void     adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_final_sse42(adler32_fold *adler);
+#endif
  #ifdef X86_AVX2_ADLER32
  extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
  #endif
diff --git a/deflate.c b/deflate.c

index a98988a831fbbdc6030c2dd44abfe92067d14804..6818a860a049a214c1c1da3e616a090a1385e76f 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -448,7 +448,11 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
          strm->adler = functable.crc32_fold_reset(&s->crc_fold);
      else
  #endif
+        //strm->adler = ADLER32_INITIAL_VALUE;
+    {
          strm->adler = ADLER32_INITIAL_VALUE;
+        functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
+    }
      s->last_flush = -2;
  
      zng_tr_init(s);
@@ -767,6 +771,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
          if (s->strstart != 0)
              put_uint32_msb(s, strm->adler);
          strm->adler = ADLER32_INITIAL_VALUE;
+        functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
          s->status = BUSY_STATE;
  
          /* Compression must start with an empty pending buffer */
@@ -973,8 +978,11 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
          put_uint32(s, (uint32_t)strm->total_in);
      } else
  #endif
-    if (s->wrap == 1)
-        put_uint32_msb(s, strm->adler);
+    {
+        strm->adler = functable.adler32_fold_final(&s->adler_fold);
+        if (s->wrap == 1)
+            put_uint32_msb(s, strm->adler);
+    }
      PREFIX(flush_pending)(strm);
      /* If avail_out is zero, the application will call deflate again
       * to flush the rest.
@@ -1083,9 +1091,11 @@ Z_INTERNAL unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned
          functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
  #endif
      } else {
-        memcpy(buf, strm->next_in, len);
          if (strm->state->wrap == 1)
-            strm->adler = functable.adler32(strm->adler, buf, len);
+            functable.adler32_fold_copy(&strm->state->adler_fold, buf, strm->next_in, len);
+        else
+            memcpy(buf, strm->next_in, len);
+            //strm->adler = functable.adler32(strm->adler, buf, len);
      }
      strm->next_in  += len;
      strm->total_in += len;
diff --git a/deflate.h b/deflate.h

index b5ae2f8e665157a75fe36e92e05c9aa9f3d25f18..2d34c95edf430883827ed15589d951acfcd00834 100644 (file)
--- a/deflate.h
+++ b/deflate.h
@@ -10,6 +10,7 @@
     subject to change. Applications should only use zlib.h.
   */
  
+#include "adler32_fold.h"
  #include "zutil.h"
  #include "zendian.h"
  #include "crc32_fold.h"
@@ -211,6 +212,7 @@ struct internal_state {
  
      int nice_match; /* Stop searching when current match exceeds this */
  
+    struct adler32_fold_s ALIGNED_(64) adler_fold;
      struct crc32_fold_s ALIGNED_(16) crc_fold;
  
                  /* used by trees.c: */
diff --git a/functable.c b/functable.c

index 317349f303528744f886676a5f3e919fc56a6883..a2c75c9fcf2bb5a43a3e66bbd42863e85d3dae6f 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -204,21 +204,37 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_
  
  Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) {
      functable.adler32_fold_reset = &adler32_fold_reset_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_reset = &adler32_fold_reset_sse42;
+#endif
      functable.adler32_fold_reset(adler, init_adler);
  }
  
  Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
      functable.adler32_fold_copy = &adler32_fold_copy_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_copy = &adler32_fold_copy_sse42;
+#endif
      functable.adler32_fold_copy(adler, dst, src, len);
  }
  
  Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) {
      functable.adler32_fold = &adler32_fold_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold = &adler32_fold_sse42;
+#endif
      functable.adler32_fold(adler, src, len);
  }
  
  Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) {
      functable.adler32_fold_final = &adler32_fold_final_c;
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX2_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+    if (x86_cpu_has_sse42)
+        functable.adler32_fold_final = &adler32_fold_final_sse42;
+#endif
      return functable.adler32_fold_final(adler);
  }
  
diff --git a/inflate.c b/inflate.c

index 9e816a40d8db089bba58258054ecc216c38aace3..864ca8882c1d477363686e0705435297cd462804 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -20,6 +20,7 @@ static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t le
  
  static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst,
                             const uint8_t *src, uint32_t copy) {
+    if (!copy) return;
      struct inflate_state *state = (struct inflate_state*)strm->state;
  #ifdef GUNZIP
      if (state->flags) {
diff --git a/win32/Makefile.msc b/win32/Makefile.msc

index b121cde22c3b4301a95ceda82798951b767469f1..bce5c004490e8ea73914c4b7f79dfc3562cf7b83 100644 (file)
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -30,9 +30,12 @@ WFLAGS  = \
         -DX86_FEATURES \
         -DX86_PCLMULQDQ_CRC \
         -DX86_SSE2 \
+    -DX86_SSE42_ADLER32 \
         -DX86_SSE42_CRC_INTRIN \
         -DX86_SSE42_CRC_HASH \
+    -DX86_SSSE3_ADLER32 \
         -DX86_AVX2 \
+    -DX86_AVX2_ADLER32 \
         -DX86_AVX_CHUNKSET \
         -DX86_SSE2_CHUNKSET \
         #
@@ -48,6 +51,11 @@ SUFFIX =
  
  OBJS = \
         adler32.obj \
+    adler32_avx2.obj \
+    adler32_avx512.obj \
+    adler32_avx512_vnni.obj \
+    adler32_sse42.obj \
+    adler32_ssse3.obj \
      adler32_fold.obj \
         chunkset.obj \
         chunkset_avx.obj \
@@ -182,6 +190,13 @@ $(TOP)/zconf$(SUFFIX).h: zconf
  SRCDIR = $(TOP)
  # Keep the dependences in sync with top-level Makefile.in
  adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
+adler32_avx2.obj: $(SRCDIR)/arch/x86/adler32_avx2.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/fallback_builtins.h
+adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_avx512_vnni.obj: $(SRCDIR)/arch/x86/adler32_avx512_vnni.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_sse42.obj: $(SRCDIR)/arch/x86/adler32_sse42.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+                   $(SRCDIR)/arch/x86/adler32_ssse3_p.h $(SRCDIR)/arch/x86/adler32_sse42_tpl.h
+adler32_ssse3.obj: $(SRCDIR)/arch/x86/adler32_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+                   $(SRCDIR)/arch/x86/adler32_ssse3_p.h
  adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h
  functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h
  gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h
author	Adam Stylinski <kungfujesus06@gmail.com>
	Sun, 3 Apr 2022 16:18:12 +0000 (12:18 -0400)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
CMakeLists.txt		patch \| blob \| blame \| history
adler32_fold.h		patch \| blob \| blame \| history
adler32_p.h		patch \| blob \| blame \| history
arch/x86/Makefile.in		patch \| blob \| blame \| history
arch/x86/adler32_sse42.c	[new file with mode: 0644]	patch \| blob
arch/x86/adler32_sse42_tpl.h	[new file with mode: 0644]	patch \| blob
arch/x86/adler32_ssse3.c		patch \| blob \| blame \| history
arch/x86/adler32_ssse3_p.h	[new file with mode: 0644]	patch \| blob
arch/x86/adler32_ssse3_tpl.h	[new file with mode: 0644]	patch \| blob
configure		patch \| blob \| blame \| history
cpu_features.h		patch \| blob \| blame \| history
deflate.c		patch \| blob \| blame \| history
deflate.h		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history
inflate.c		patch \| blob \| blame \| history
win32/Makefile.msc		patch \| blob \| blame \| history