From: Adam Stylinski
Date: Sun, 24 Oct 2021 23:24:53 +0000 (-0400)
Subject: Minor efficiency improvement
X-Git-Tag: 2.1.0-beta1~486
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9a1109bd77159c43150eb32c5e09e16293da332d;p=thirdparty%2Fzlib-ng.git

Minor efficiency improvement

This now leverages the broadcasting intrinsics with an AND mask to load
up the registers. Additionally, there's a minor efficiency boost here
from casting up to 64-bit precision (by means of register aliasing) so
that the modulo can be safely deferred until the write back to the full
sums.

The "write" back to the stack here is actually optimized out by GCC and
turned into a write directly to a 32-bit GPR for each of the 8 elements.
That much is not new, but now that we no longer do a modulus with the
BASE value for each element, we bypass 8 64-bit multiplications, shifts,
and subtractions while in those registers.

I tried doing a horizontal reduction sum on the 8 elements at 64-bit
precision, since the vpextract* set of instructions aren't exactly low
latency. However, doing this safely (with no overflow) requires 2
128-bit register extractions, 8 vpmovsxdq instructions to bring things
up to 64-bit precision, some shuffles, more 128-bit extractions to get
around the 128-bit lane requirement of the shuffles, and finally a trip
to a GPR and back to do the modulus on the scalar value. That approach
could have been more efficient if there were an inexpensive 64-bit
horizontal addition instruction for AVX, but there isn't.

To test this, I wrote a pretty basic benchmark using Python's zlib
bindings on a huge set of random data, carefully timing only the
checksum bits. Invoking perf stat from within the Python process after
the RNG shows a lower average number of cycles to complete and a
shorter runtime.
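As a sanity check on the deferred modulo, here is a minimal scalar
sketch (not part of the patch; BASE = 65521 as in Adler-32 and the lane
values are made up) showing that summing the eight 32-bit partials at
64-bit precision and reducing once gives the same residue as reducing
every lane first:

    #include <stdint.h>
    #include <stdio.h>

    #define BASE 65521u  /* largest prime below 65536, as in Adler-32 */

    int main(void) {
        /* hypothetical lane values standing in for the vector accumulators */
        uint32_t s1_unpack[8] = {123456789u, 4000000000u, 7u, 65520u,
                                 987654321u, 1u, 3141592u, 2718281828u};

        /* old approach: reduce every lane, then reduce the sum */
        uint32_t per_lane = 0;
        for (int i = 0; i < 8; ++i)
            per_lane += s1_unpack[i] % BASE;
        per_lane %= BASE;

        /* new approach: one 64-bit sum, then a single modulo */
        uint64_t deferred = 0;
        for (int i = 0; i < 8; ++i)
            deferred += (uint64_t)s1_unpack[i];
        deferred %= BASE;

        /* both print the same residue */
        printf("per-lane: %u  deferred: %u\n", per_lane, (uint32_t)deferred);
        return 0;
    }

The 64-bit sum can be at most 8 * (2^32 - 1), nowhere near overflowing
a 64-bit register, which is what makes the single modulo safe.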
---

diff --git a/arch/x86/adler32_avx.c b/arch/x86/adler32_avx.c
index db43446f..183a4d6f 100644
--- a/arch/x86/adler32_avx.c
+++ b/arch/x86/adler32_avx.c
@@ -33,27 +33,22 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
     if (UNLIKELY(len < 16))
         return adler32_len_16(adler, buf, len, sum2);
 
-    uint32_t ALIGNED_(32) s1[8], s2[8];
+    __m256i vsMask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+    __m256i vs1 = _mm256_set1_epi32(adler);
+    __m256i vs2 = _mm256_set1_epi32(sum2);
+    vs1 = _mm256_and_si256(vs1, vsMask);
+    vs2 = _mm256_and_si256(vs2, vsMask);
 
-    memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
-    memset(s2, 0, sizeof(s2)); s2[7] = sum2;
-
-    char ALIGNED_(32) dot1[32] = \
-        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
-    char ALIGNED_(32) dot2[32] = \
-        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
-         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
-    short ALIGNED_(32) dot3[16] = \
-        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
+    const __m256i dot1v = _mm256_set1_epi8(1);
+    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27,
+                                           26, 25, 24, 23, 22, 21,
+                                           20, 19, 18, 17, 16, 15,
+                                           14, 13, 12, 11, 10, 9,
+                                           8, 7, 6, 5, 4, 3, 2, 1);
+    __m256i dot3v = _mm256_set1_epi16(1);
 
     while (len >= 32) {
-        __m256i vs1 = _mm256_load_si256((__m256i*)s1);
-        __m256i vs2 = _mm256_load_si256((__m256i*)s2);
         __m256i vs1_0 = vs1;
 
         int k = (len < NMAX ? (int)len : NMAX);
@@ -88,15 +83,20 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
         _mm256_store_si256((__m256i*)s1_unpack, vs1);
         _mm256_store_si256((__m256i*)s2_unpack, vs2);
 
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
-                (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
-        adler %= BASE;
-        s1[7] = adler;
+        uint64_t adler64 = ((uint64_t)s1_unpack[0] + (uint64_t)s1_unpack[1] + (uint64_t)s1_unpack[2] +
+                            (uint64_t)s1_unpack[3] + (uint64_t)s1_unpack[4] + (uint64_t)s1_unpack[5] +
+                            (uint64_t)s1_unpack[6] + (uint64_t)s1_unpack[7]) % BASE;
+
+        uint64_t sum264 = ((uint64_t)s2_unpack[0] + (uint64_t)s2_unpack[1] + (uint64_t)s2_unpack[2] +
+                           (uint64_t)s2_unpack[3] + (uint64_t)s2_unpack[4] + (uint64_t)s2_unpack[5] +
+                           (uint64_t)s2_unpack[6] + (uint64_t)s2_unpack[7]) % BASE;
 
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
-               (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
-        sum2 %= BASE;
-        s2[7] = sum2;
+        adler = (uint32_t)adler64;
+        sum2 = (uint32_t)sum264;
+        vs1 = _mm256_set1_epi32(adler);
+        vs2 = _mm256_set1_epi32(sum2);
+        vs1 = _mm256_and_si256(vs1, vsMask);
+        vs2 = _mm256_and_si256(vs2, vsMask);
     }
 
     /* Process tail (len < 16). */
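For reference, a minimal standalone sketch of what the broadcast +
AND-mask initialization above computes (not part of the change itself;
it assumes an AVX2-capable compiler, e.g. gcc -mavx2, and uses a made-up
adler value): the scalar is broadcast to all 8 lanes and the mask keeps
only lane 7, matching the old memset()/s1[7] = adler layout.

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint32_t adler = 0x12345u;   /* arbitrary example checksum component */

        /* all-zero mask except lane 7, which is all ones */
        __m256i vsMask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, -1);
        /* broadcast the scalar to all 8 lanes, then keep only lane 7 */
        __m256i vs1 = _mm256_set1_epi32((int)adler);
        vs1 = _mm256_and_si256(vs1, vsMask);

        uint32_t lanes[8];
        _mm256_storeu_si256((__m256i *)lanes, vs1);
        for (int i = 0; i < 8; ++i)
            printf("lane %d: 0x%08x\n", i, lanes[i]);  /* lanes 0..6 print 0, lane 7 prints adler */
        return 0;
    }

The same pattern is reused at the bottom of the main loop to reseed
vs1/vs2 from the freshly reduced adler and sum2.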