Adding avx512_vnni inline + copy elision

author Adam Stylinski <kungfujesus06@gmail.com>

Fri, 8 Apr 2022 17:24:21 +0000 (13:24 -0400)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
author Adam Stylinski <kungfujesus06@gmail.com>
Fri, 8 Apr 2022 17:24:21 +0000 (13:24 -0400)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
diff --git a/adler32_fold.c b/adler32_fold.c

index 688f848533367c36a44d8fb9af0324c5bc44f91b..20fec2bd3c94269d17edb59c20028ce0f383e410 100644 (file)
--- a/adler32_fold.c
+++ b/adler32_fold.c
@@ -1,4 +1,4 @@
-/* crc32_fold.c -- adler32 folding interface
+/* adler32_fold.c -- adler32 folding interface
   * Copyright (C) 2022 Adam Stylinski 
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
diff --git a/adler32_fold.h b/adler32_fold.h

index ea456adc31df961b74b3f8683a286e9f916b9ed9..20aa1c7400b76a5dd7b48f1a736cd0cceef9d3f1 100644 (file)
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -6,8 +6,6 @@
  #ifndef ADLER32_FOLD_H_
  #define ADLER32_FOLD_H_
  
-#include <stdint.h>
-
  Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  
  #endif
diff --git a/adler32_p.h b/adler32_p.h

index 5a14172f73294f1043c79c78b9e280704d06e5cd..1d2e77f49f0db0d5792bc018494f27a7d22750be 100644 (file)
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,10 +26,10 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u
      return adler | (sum2 << 16);
  }
  
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
-    while (len--) {
-        *dst = *buf++; 
-        adler += *dst++;
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++;
          sum2 += adler;
      }
      adler %= BASE;
@@ -38,10 +38,10 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *
      return adler | (sum2 << 16);
  }
  
-static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
-    while (len) {
-        --len;
-        adler += *buf++;
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++; 
+        adler += *dst++;
          sum2 += adler;
      }
      adler %= BASE;
diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c

index fcca34ec53e91ca6c07b81505af65ccba28249f5..dcd1166f342b1bd9505eab289e955985be75b9a6 100644 (file)
--- a/arch/x86/adler32_avx2.c
+++ b/arch/x86/adler32_avx2.c
@@ -5,29 +5,13 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../fallback_builtins.h"
-#include "adler32_avx2_p.h"
-#include "../../adler32_fold.h"
-#include <stdio.h>
-
  #include <immintrin.h>
  
  #ifdef X86_AVX2_ADLER32
  
  #include "adler32_avx2_tpl.h"
-#undef ADLER32_AVX2_TPL_H_
+
  #define COPY
  #include "adler32_avx2_tpl.h"
-#undef COPY
-
-/*
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 1L;
-    if (len == 0) return adler;
-    return adler32_fold_avx2(adler, buf, len);
-}
-*/
  
  #endif
diff --git a/arch/x86/adler32_avx2_p.h b/arch/x86/adler32_avx2_p.h

index 1c80bde0572b5a52ac7ee5e3ddf47c212d5c52a7..f7079bf3eb2c73383ed6718ef6d69d8287664b1b 100644 (file)
--- a/arch/x86/adler32_avx2_p.h
+++ b/arch/x86/adler32_avx2_p.h
@@ -6,10 +6,10 @@
  #ifndef ADLER32_AVX2_P_H_
  #define ADLER32_AVX2_P_H_
  
-#ifdef X86_AVX2_ADLER32
+#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32)
  
  /* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
-static inline uint32_t hsum(__m256i x) {
+static inline uint32_t hsum256(__m256i x) {
      __m128i sum1  = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
                                    _mm256_castsi256_si128(x));
      __m128i sum2  = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
@@ -17,7 +17,7 @@ static inline uint32_t hsum(__m256i x) {
      return (uint32_t)_mm_cvtsi128_si32(sum3);
  }
  
-static inline uint32_t partial_hsum(__m256i x) {
+static inline uint32_t partial_hsum256(__m256i x) {
      /* We need a permutation vector to extract every other integer. The
       * rest are going to be zeros */
      const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h

index 7df51d573dbd7a2336e5504a6ac28849442c8379..59cacfa48324a3d85b629bf2847436c1c6bd0c56 100644 (file)
--- a/arch/x86/adler32_avx2_tpl.h
+++ b/arch/x86/adler32_avx2_tpl.h
@@ -3,9 +3,6 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#ifndef ADLER32_AVX2_TPL_H_
-#define ADLER32_AVX2_TPL_H_
-
  #include "../../zbuild.h"
  #include <immintrin.h>
  #include "../../adler32_fold.h"
@@ -38,9 +35,9 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len)
  rem_peel:
      if (len < 16) {
  #ifdef COPY
-       return adler32_copy_len_16(adler0, src, dst, len, adler1);
+        return adler32_copy_len_16(adler0, src, dst, len, adler1);
  #else
-       return adler32_len_16(adler0, src, len, adler1);
+        return adler32_len_16(adler0, src, len, adler1);
  #endif
      } else if (len < 32) {
  #ifdef COPY
@@ -129,8 +126,8 @@ rem_peel:
           * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
           * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
           * what the compiler is doing to avoid integer divisions. */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
      }
  
      adler = adler0 | (adler1 << 16);
@@ -141,5 +138,3 @@ rem_peel:
  
      return adler;
  }
-
-#endif
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c

index e26b9cc524ac82a98807d452d3157cd8f83a4ed6..c0bf0721f2c8662c69d966d70dab931e9660851a 100644 (file)
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -6,20 +6,11 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
-#include "../../fallback_builtins.h"
-#include <immintrin.h>
-#include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
-
  #ifdef X86_AVX512_ADLER32
  
  #include "adler32_avx512_tpl.h"
-#undef ADLER32_AVX512_TPL_H_
+
  #define COPY
  #include "adler32_avx512_tpl.h"
-#undef COPY
  
  #endif
diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h

index df5dd3810f60c7528606f810dbe2965d0f1b41a9..d324ce98599ebc79b056e4f6317bdca951eb94dc 100644 (file)
--- a/arch/x86/adler32_avx512_tpl.h
+++ b/arch/x86/adler32_avx512_tpl.h
@@ -3,16 +3,13 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#ifndef ADLER32_AVX512_TPL_H_
-#define ADLER32_AVX512_TPL_H_
-
  #include "../../zbuild.h"
  #include "../../adler32_p.h"
+#include "../../adler32_fold.h"
  #include "../../cpu_features.h"
  #include "../../fallback_builtins.h"
  #include <immintrin.h>
  #include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
  
  #ifdef X86_AVX512_ADLER32
  
@@ -22,13 +19,13 @@ Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const
  Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
  #endif
  
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
      uint32_t adler0, adler1;
      adler1 = (adler >> 16) & 0xffff;
      adler0 = adler & 0xffff; 
  
-    if (src == NULL) return 1L;
-    if (len == 0) return adler;
-
  rem_peel:
      if (len < 64) {
          /* This handles the remaining copies, just call normal adler checksum after this */
@@ -107,4 +104,3 @@ rem_peel:
  }
  
  #endif
-#endif
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c

index 253eed9c6a8f22904d2621b0174cae7fb0c920d1..330bfe38e7d2e3c4144c3ea89d62377f23f6ee75 100644 (file)
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -7,66 +7,54 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
+#ifdef X86_AVX512VNNI_ADLER32
+
  #include "../../zbuild.h"
  #include "../../adler32_p.h"
  #include "../../cpu_features.h"
  #include "../../fallback_builtins.h"
  #include <immintrin.h>
+#include "../../adler32_fold.h"
  #include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
  
-#ifdef X86_AVX512VNNI_ADLER32
-Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
-    uint32_t sum2;
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff; 
  
-    /* For impossibly tiny sizes, use the smaller width versions. We still need
-     * to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
+rem_peel:
      if (len < 32)
-        return adler32_sse41(adler, buf, len);
+#if defined(X86_SSSE3_ADLER32)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1); 
  #endif
  
-#ifdef X86_AVX2_ADLER32
      if (len < 64)
-        return adler32_avx2(adler, buf, len);
-#endif
-
-     /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support
-     * This should make for shorter compiled code */
-#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32)
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1))
-        return adler32_len_1(adler, buf, sum2);
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(buf == NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16))
-        return adler32_len_16(adler, buf, len, sum2);
+#ifdef X86_AVX2_ADLER32
+        return adler32_avx2(adler, src, len);
+#elif defined(X86_SSE3_ADLER32)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1); 
  #endif
  
-    /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up
-     * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial,
-     * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better
-     * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be
-     * aliased up to a 512 bit wide zmm */
-    __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-    __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
-
      const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                            20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                            38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                            56, 57, 58, 59, 60, 61, 62, 63, 64);
  
      const __m512i zero = _mm512_setzero_si512();
+    __m512i vs1, vs2;
  
      while (len >= 64) {
-        int k = (len < NMAX ? (int)len : NMAX);
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
          k -= k % 64;
          len -= k;
          __m512i vs1_0 = vs1;
@@ -77,8 +65,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
  
          /* Remainder peeling */
          if (k % 128) {
-            vbuf1 = _mm512_loadu_si512(buf);
-            buf += 64;
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
              k -= 64;
  
              __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
@@ -94,9 +83,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
                 vs1 = adler + sum(c[i])
                 vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
              */
-            vbuf0 = _mm512_loadu_si512(buf);
-            vbuf1 = _mm512_loadu_si512(buf + 64);
-            buf += 128;
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
              k -= 128;
  
              __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
@@ -117,14 +106,120 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
          vs2 = _mm512_add_epi32(vs2, vs3);
          vs2 = _mm512_add_epi32(vs2, vs2_1);
  
-        adler = partial_hsum(vs1) % BASE;
-        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
-        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler; 
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff; 
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3_ADLER32)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1); 
+#endif
+    }
+
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
      }
  
+    adler = adler0 | (adler1 << 16);
+
      /* Process tail (len < 64). */
-    return adler32_len_16(adler, buf, len, sum2);
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler; 
  }
  
  #endif
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c

index 4f21702aaf6767a52db9dfa1a95e0acd492a4899..92efe4d8db35dafbf51beb23a424f873b6f09acc 100644 (file)
--- a/arch/x86/adler32_sse42.c
+++ b/arch/x86/adler32_sse42.c
@@ -1,4 +1,4 @@
-/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
   * Copyright (C) 1995-2011 Mark Adler
   * Authors:
   *   Adam Stylinski <kungfujesus06@gmail.com>
@@ -15,7 +15,6 @@
  #ifdef X86_SSE42_ADLER32
  
  Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-
      uint32_t adler0, adler1;
      adler1 = (adler >> 16) & 0xffff;
      adler0 = adler & 0xffff; 
diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h

deleted file mode 100644 (file)

index aedfa81..0000000
--- a/arch/x86/adler32_ssse3_tpl.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_SSSE3_TPL_H_
-#define ADLER32_SSSE3_TPL_H_
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
-#include "adler32_ssse3_p.h"
-
-#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
-#else
-Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
-#endif
-    uint32_t adler0, adler1;
-
-     /* split Adler-32 into component sums */
-    adler1 = (adler->nsums >> 16) & 0xffff;
-    adler0 = adler->nsums & 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1)) {
-#ifdef COPY
-        *(dst++) = *src;
-#endif
-        adler->nsums = adler32_len_1(adler0, src, adler1);
-        return;
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(src == NULL)) {
-        adler->nsums = 1L;
-        return;
-    }
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16)) {
-        goto sub16;
-    }
-
-    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
-    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-    const __m128i dot3v = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-
-    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
-            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
-    /* If our buffer is unaligned (likely), make the determination whether
-     * or not there's enough of a buffer to consume to make the scalar, aligning
-     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
-     * load. This is a pretty simple test, just test if 16 - the remainder + len is
-     * < 16 */
-    size_t max_iters = NMAX;
-    size_t rem = (uintptr_t)src & 15;
-    size_t align_offset = 16 - rem;
-    size_t k = 0;
-    if (rem) {
-        if (len < 16 + align_offset) {
-            /* Let's eat the cost of this one unaligned load so that
-             * we don't completely skip over the vectorization. Doing
-             * 16 bytes at a time unaligned is is better than 16 + <= 15
-             * sums */
-            vbuf = _mm_loadu_si128((__m128i*)src);
-            len -= 16;
-            src += 16;
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_cvtsi32_si128(adler0);
-            vs2 = _mm_cvtsi32_si128(adler1);
-            vs3 = _mm_setzero_si128();
-            vs1_0 = vs1;
-            goto unaligned_jmp;
-        }
-
-#ifdef COPY
-        memcpy(dst, src, align_offset);
-        dst += align_offset;
-#endif
-        for (size_t i = 0; i < align_offset; ++i) {
-            adler0 += *(src++);
-            adler1 += adler0;
-        }
-
-        /* lop off the max number of sums based on the scalar sums done
-         * above */
-        len -= align_offset;
-        max_iters -= align_offset; 
-    }
-
-
-    while (len >= 16) {
-        vs1 = _mm_cvtsi32_si128(adler0);
-        vs2 = _mm_cvtsi32_si128(adler1);
-        vs3 = _mm_setzero_si128();
-        vs2_0 = _mm_setzero_si128();
-        vs1_0 = vs1;
-
-        k = (len < max_iters ? len : max_iters);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
-            src += 32;
-            k -= 32;
-
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
-            dst += 32;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-
-            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
-            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
-            vs1_0 = vs1;
-        }
-
-        vs2 = _mm_add_epi32(vs2_0, vs2);
-        vs3 = _mm_slli_epi32(vs3, 5);
-        vs2 = _mm_add_epi32(vs3, vs2);
-        vs3 = _mm_setzero_si128();
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            src += 16;
-            k -= 16;
-
-unaligned_jmp:
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vs1_0 = vs1;
-        }
-
-        vs3 = _mm_slli_epi32(vs3, 4);
-        vs2 = _mm_add_epi32(vs2, vs3);
-
-        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
-         * a partial reduction sum implicitly and only summing to integers in vector positions
-         * 0 and 2. This saves us some contention on the shuffle port(s) */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
-        max_iters = NMAX;
-    }
-
-sub16:
-#ifdef COPY
-    adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
-#else
-    /* Process tail (len < 16).  */
-    adler->nsums = adler32_len_16(adler0, src, len, adler1);
-#endif
-}
-
-#endif
diff --git a/cpu_features.h b/cpu_features.h

index 9e0d5cb95df26c3fa9c182973dfcc11a3de4dc91..fc1b5d73395f821117c8819de91fd33d805e5822 100644 (file)
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -6,8 +6,8 @@
  #ifndef CPU_FEATURES_H_
  #define CPU_FEATURES_H_
  
-#include "crc32_fold.h"
  #include "adler32_fold.h"
+#include "crc32_fold.h"
  
  #if defined(X86_FEATURES)
  #  include "arch/x86/x86_features.h"
@@ -35,16 +35,11 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len
  #ifdef X86_SSSE3_ADLER32
  extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
  #endif
-#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
  #ifdef X86_AVX2_ADLER32
  extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  #endif
  #ifdef X86_AVX512_ADLER32
  extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  #endif
  #ifdef X86_AVX512VNNI_ADLER32
  extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
@@ -53,6 +48,20 @@ extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, si
  extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
  #endif
  
+/* adler32 folding */
+#ifdef X86_SSE42_ADLER32
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
  /* CRC32 folding */
  #ifdef X86_PCLMULQDQ_CRC
  extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
diff --git a/deflate.c b/deflate.c

index 006803c320819d3d17e343de376bd44f7cc82fe2..c2700f2b5f55e38efe4aedc5eb24438385760644 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -52,7 +52,6 @@
  #include "deflate.h"
  #include "deflate_p.h"
  #include "functable.h"
-#include <stdio.h>
  
  const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
  /*
@@ -446,7 +445,6 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
  
  #ifdef GZIP
      if (s->wrap == 2) {
-        /* Ensure that there's always a reset, regardless of "wrap" */
          strm->adler = functable.crc32_fold_reset(&s->crc_fold);
      } else
  #endif
diff --git a/deflate.h b/deflate.h

index abc87d8b8ed27c6fe2cb5c087e21a6dd7a8899fd..1b59c7539d34976bd6c189021d0d70b86bf35f9e 100644 (file)
--- a/deflate.h
+++ b/deflate.h
@@ -10,9 +10,9 @@
     subject to change. Applications should only use zlib.h.
   */
  
-#include "adler32_fold.h"
  #include "zutil.h"
  #include "zendian.h"
+#include "adler32_fold.h"
  #include "crc32_fold.h"
  
  /* define NO_GZIP when compiling if you want to disable gzip header and
diff --git a/functable.c b/functable.c

index 8328359702fc7ee341236d14176411b4b44fd878..3945323125acd540a18621672404d61f0e68ebc2 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -204,7 +204,7 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_
  
  Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
      functable.adler32_fold_copy = &adler32_fold_copy_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+#if (defined X86_SSE42_ADLER32)
      if (x86_cpu_has_sse42)
          functable.adler32_fold_copy = &adler32_fold_copy_sse42;
  #endif
@@ -215,6 +215,10 @@ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const u
  #ifdef X86_AVX512_ADLER32
      if (x86_cpu_has_avx512)
          functable.adler32_fold_copy = &adler32_fold_copy_avx512;
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+    if (x86_cpu_has_avx512vnni)
+        functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
  #endif
      return functable.adler32_fold_copy(adler, dst, src, len);
  }
diff --git a/inflate.c b/inflate.c

index 8611e4bd485943ed39ad8dea314885f3dbca5dbd..b9c3aa79730475d8a51be4d7f944e7711f11242b 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -609,7 +609,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
  #endif
          case DICTID:
              NEEDBITS(32);
-            //strm->adler = state->check = ZSWAP32(hold);
              strm->adler = state->check = ZSWAP32(hold);
              INITBITS();
              state->mode = DICT;
diff --git a/inflate.h b/inflate.h

index 5761077a60a05241ebc3b3ad522f09bc839e6d6b..941e8b0a282c300f670bd02497c1f619a7819861 100644 (file)
--- a/inflate.h
+++ b/inflate.h
@@ -11,8 +11,8 @@
  #ifndef INFLATE_H_
  #define INFLATE_H_
  
-#include "crc32_fold.h"
  #include "adler32_fold.h"
+#include "crc32_fold.h"
  
  /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate().
     NO_GZIP would be used to avoid linking in the crc code when it is not needed.
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt

index df1df4973126c4639260b0fe451602dbbaf7e19b..19762fc738abe506d2a396c088277567e2000c68 100644 (file)
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
  
  add_executable(benchmark_zlib
      benchmark_adler32.cc
+    benchmark_adler32_copy.cc
      benchmark_compare256.cc
      benchmark_crc32.cc
      benchmark_main.cc
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc

new file mode 100644 (file)

index 0000000..fac4c7f
--- /dev/null
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -0,0 +1,117 @@
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len);
+
+class adler32_copy: public benchmark::Fixture {
+private:
+    uint32_t *random_ints_src;
+    uint32_t *random_ints_dst;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        /* Control the alignment so that we have the best case scenario for loads. With
+         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+         * And while this is a realistic scenario, it makes it difficult to compare benchmark
+         * to benchmark because one allocation could have been aligned perfectly for the loads
+         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
+         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
+         * control the _consistency_ of the results */
+        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints_src[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+                                (const unsigned char*)random_ints_src, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints_src);
+        zng_free(random_ints_dst);
+    }
+};
+
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const unsigned char *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, len); \
+            return fptr(init_sum, buf, len); \
+        }); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
diff --git a/win32/Makefile.msc b/win32/Makefile.msc

index 8db2633b44f81ed66dab033563aaf49a9c15ccb3..8a398e499392a68d081422e7530f0c31d73b76da 100644 (file)
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -30,15 +30,15 @@ WFLAGS  = \
         -DX86_FEATURES \
         -DX86_PCLMULQDQ_CRC \
         -DX86_SSE2 \
-    -DX86_SSE42_ADLER32 \
+       -DX86_SSE42_ADLER32 \
         -DX86_SSE42_CRC_INTRIN \
         -DX86_SSE42_CRC_HASH \
-    -DX86_SSSE3_ADLER32 \
+       -DX86_SSSE3_ADLER32 \
         -DX86_AVX2 \
-    -DX86_AVX2_ADLER32 \
+       -DX86_AVX2_ADLER32 \
         -DX86_AVX_CHUNKSET \
-       -DX86_SSE2_CHUNKSET \
-       #
+       -DX86_SSE2_CHUNKSET
+
  LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
  ARFLAGS = -nologo
  RCFLAGS = /dWIN32 /r
@@ -51,12 +51,12 @@ SUFFIX =
  
  OBJS = \
         adler32.obj \
-    adler32_avx2.obj \
-    adler32_avx512.obj \
-    adler32_avx512_vnni.obj \
-    adler32_sse42.obj \
-    adler32_ssse3.obj \
-    adler32_fold.obj \
+       adler32_avx2.obj \
+       adler32_avx512.obj \
+       adler32_avx512_vnni.obj \
+       adler32_sse42.obj \
+       adler32_ssse3.obj \
+       adler32_fold.obj \
         chunkset.obj \
         chunkset_avx.obj \
         chunkset_sse2.obj \
author	Adam Stylinski <kungfujesus06@gmail.com>
	Fri, 8 Apr 2022 17:24:21 +0000 (13:24 -0400)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Mon, 23 May 2022 14:13:39 +0000 (16:13 +0200)
adler32_fold.c		patch \| blob \| blame \| history
adler32_fold.h		patch \| blob \| blame \| history
adler32_p.h		patch \| blob \| blame \| history
arch/x86/adler32_avx2.c		patch \| blob \| blame \| history
arch/x86/adler32_avx2_p.h		patch \| blob \| blame \| history
arch/x86/adler32_avx2_tpl.h		patch \| blob \| blame \| history
arch/x86/adler32_avx512.c		patch \| blob \| blame \| history
arch/x86/adler32_avx512_tpl.h		patch \| blob \| blame \| history
arch/x86/adler32_avx512_vnni.c		patch \| blob \| blame \| history
arch/x86/adler32_sse42.c		patch \| blob \| blame \| history
arch/x86/adler32_ssse3_tpl.h	[deleted file]	patch \| blob \| blame \| history
cpu_features.h		patch \| blob \| blame \| history
deflate.c		patch \| blob \| blame \| history
deflate.h		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history
inflate.c		patch \| blob \| blame \| history
inflate.h		patch \| blob \| blame \| history
test/benchmarks/CMakeLists.txt		patch \| blob \| blame \| history
test/benchmarks/benchmark_adler32_copy.cc	[new file with mode: 0644]	patch \| blob
win32/Makefile.msc		patch \| blob \| blame \| history