git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Explicit SSE2 vectorization of Chorba CRC method
author    Adam Stylinski <kungfujesus06@gmail.com>
          Sun, 16 Feb 2025 17:13:00 +0000 (12:13 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
          Fri, 28 Mar 2025 19:43:59 +0000 (20:43 +0100)
The version that's currently in the generic implementation for 32768-byte
buffers leverages the stack. It manages to autovectorize, but unfortunately
the trips to the stack hurt its performance on the CPUs that need this the
most. This version is explicitly SIMD vectorized and avoids trips to the
stack. In my testing it's ~10% faster than the "small" variant, and about
42% faster than the "32768" variant.

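For reference, here is a minimal sketch of the core Chorba round this patch vectorizes, reconstructed from the scalar reference comments and the NEXT_ROUND macro in the chorba_sse2.c diff below; the helper names are illustrative only and are not part of the patch.

#include <stdint.h>
#include <emmintrin.h>

/* Scalar form of one Chorba term expansion, matching the commented-out
 * reference code kept in chorba_sse2.c. */
static inline void chorba_round_scalar(uint64_t in, uint64_t t[4]) {
    t[0] = (in << 17) ^ (in << 55);
    t[1] = (in >> 47) ^ (in >> 9) ^ (in << 19);
    t[2] = (in >> 45) ^ (in << 44);
    t[3] = (in >> 20);
}

/* SSE2 form: the same four terms for two 64-bit lanes at once, as done by
 * the NEXT_ROUND macro, keeping the intermediates in XMM registers instead
 * of spilling them to a stack buffer. */
static inline void chorba_round_sse2(__m128i in, __m128i t[4]) {
    t[0] = _mm_xor_si128(_mm_slli_epi64(in, 17), _mm_slli_epi64(in, 55));
    t[1] = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(in, 47), _mm_srli_epi64(in, 9)),
                         _mm_slli_epi64(in, 19));
    t[2] = _mm_xor_si128(_mm_srli_epi64(in, 45), _mm_slli_epi64(in, 44));
    t[3] = _mm_srli_epi64(in, 20);
}

Processing two 64-bit words per instruction and folding the carry ("next"/"chorba") terms with vector XORs is what removes the stack round-trips the message above refers to.
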
CMakeLists.txt
arch/x86/Makefile.in
arch/x86/chorba_sse2.c [new file with mode: 0644]
arch/x86/x86_functions.h
arch/x86/x86_intrins.h
configure
crc32.h
functable.c
test/benchmarks/benchmark_crc32.cc
test/test_crc32.cc
win32/Makefile.msc

index 1610e8b8bb5a261c82d4ec632678b800dd764281..5243251aaddb2fb67d8ef84ff6b54dbba774d9a5 100644 (file)
@@ -966,7 +966,7 @@ if(WITH_OPTIM)
             endif()
             if(HAVE_SSE2_INTRIN)
                 add_definitions(-DX86_SSE2)
-                set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+                set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
                 list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
                 if(NOT ${ARCH} MATCHES "x86_64")
                     set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
index a797517df32a6568d536a507c636aa4e014b51e4..7705cd0913330268270676036896e816a72577d3 100644 (file)
@@ -34,6 +34,7 @@ all: \
        chunkset_avx512.o chunkset_avx512.lo \
        chunkset_sse2.o chunkset_sse2.lo \
        chunkset_ssse3.o chunkset_ssse3.lo \
+       chorba_sse2.o chorba_sse2.lo \
        compare256_avx2.o compare256_avx2.lo \
        compare256_sse2.o compare256_sse2.lo \
        crc32_pclmulqdq.o crc32_pclmulqdq.lo \
@@ -71,6 +72,12 @@ chunkset_ssse3.o:
 chunkset_ssse3.lo:
        $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
 
+chorba_sse2.o:
+       $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
+
+chorba_sse2.lo:
+       $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
+
 compare256_avx2.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
diff --git a/arch/x86/chorba_sse2.c b/arch/x86/chorba_sse2.c
new file mode 100644 (file)
index 0000000..5f38cfc
--- /dev/null
@@ -0,0 +1,880 @@
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2)
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32.h"
+#include <emmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch/generic/generic_functions.h"
+#include <assert.h>
+
+extern uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
+
+#define READ_NEXT(in, off, a, b) do { \
+        a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+        b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+        } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+        a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+        b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+        c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+        d  = _mm_srli_epi64(invec, 20); \
+        } while (0);
+
+Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint64_t* buf, size_t len) {
+    const uint64_t* input = buf;
+    ALIGNED_(16) uint64_t final[9] = {0};
+    uint64_t next1 = crc;
+    crc = 0;
+    uint64_t next2 = 0;
+    uint64_t next3 = 0;
+    uint64_t next4 = 0;
+    uint64_t next5 = 0;
+
+    __m128i next12 = _mm_cvtsi64x_si128(next1);
+    __m128i next34 = _mm_setzero_si128();
+    __m128i next56 = _mm_setzero_si128();
+    __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;
+
+    size_t i = 0;
+
+    /* This is weird, doing for vs while drops 10% off the exec time */
+    for(; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+        __m128i in1in2, in3in4;
+
+        /*
+        uint64_t chorba1 = input[i / sizeof(uint64_t)];
+        uint64_t chorba2 = input[i / sizeof(uint64_t) + 1];
+        uint64_t chorba3 = input[i / sizeof(uint64_t) + 2];
+        uint64_t chorba4 = input[i / sizeof(uint64_t) + 3];
+        uint64_t chorba5 = input[i / sizeof(uint64_t) + 4];
+        uint64_t chorba6 = input[i / sizeof(uint64_t) + 5];
+        uint64_t chorba7 = input[i / sizeof(uint64_t) + 6];
+        uint64_t chorba8 = input[i / sizeof(uint64_t) + 7];
+        */
+
+        const uint64_t *inputPtr = input + (i / sizeof(uint64_t));
+        const __m128i *inputPtr128 = (__m128i*)inputPtr;
+        __m128i chorba12 = _mm_load_si128(inputPtr128++);
+        __m128i chorba34 = _mm_load_si128(inputPtr128++);
+        __m128i chorba56 = _mm_load_si128(inputPtr128++);
+        __m128i chorba78 = _mm_load_si128(inputPtr128++);
+
+        chorba12 = _mm_xor_si128(chorba12, next12);
+        chorba34 = _mm_xor_si128(chorba34, next34);
+        chorba56 = _mm_xor_si128(chorba56, next56);
+        chorba78 = _mm_xor_si128(chorba78, chorba12);
+        __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
+        __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
+                                                           _mm_castsi128_pd(chorba34), 1));
+        /*
+        chorba1 ^= next1;
+        chorba2 ^= next2;
+        chorba3 ^= next3;
+        chorba4 ^= next4;
+        chorba5 ^= next5;
+        chorba7 ^= chorba1;
+        chorba8 ^= chorba2;
+        */
+        i += 8 * 8;
+
+        /* 0-3 */
+        /*in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];*/
+        READ_NEXT(input, i, in1in2, in3in4);
+        __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
+        in1in2 = _mm_xor_si128(in1in2, chorba34xor);
+        /*
+        in1 ^= chorba3;
+        in2 ^= chorba4 ^ chorba1;
+        */
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        */
+
+        in3in4 = _mm_xor_si128(in3in4, ab1);
+        /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
+        __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));
+
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
+        in3in4 = _mm_xor_si128(in3in4, chorba12);
+
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
+        in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+        a4_ = _mm_xor_si128(b2c2, a4_);
+        next12 = _mm_xor_si128(ab3, a4_);
+        next12 = _mm_xor_si128(next12, cd1);
+
+        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+
+        /*out1 = a3 ^ b2 ^ c1;
+        out2 = b3 ^ c2 ^ d1 ^ a4;*/
+        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+        //out3 = b4 ^ c3 ^ d2;
+        //out4 = c4 ^ d3;
+
+        //out5 = d4;
+
+        /*
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        i += 32;
+
+        /* 4-7 */
+        /*in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];*/
+        READ_NEXT(input, i, in1in2, in3in4);
+
+        in1in2 = _mm_xor_si128(in1in2, next12);
+        in1in2 = _mm_xor_si128(in1in2, chorba78);
+        in1in2 = _mm_xor_si128(in1in2, chorba45);
+        in1in2 = _mm_xor_si128(in1in2, chorba34);
+
+        /*
+        in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
+        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
+        */
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+
+        in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
+        in4 ^= next4 ^ b1 ^ a2  ^ chorba7 ^ chorba6;
+        */
+        in3in4 = _mm_xor_si128(in3in4, next34);
+        in3in4 = _mm_xor_si128(in3in4, ab1);
+        in3in4 = _mm_xor_si128(in3in4, chorba56);
+        __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+
+        /*
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        ///*
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+        a4_ = _mm_xor_si128(b2c2, a4_);
+        next12 = _mm_xor_si128(ab3, cd1);
+
+        next12 = _mm_xor_si128(next12, a4_);
+        next12 = _mm_xor_si128(next12, next56);
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        next34 = _mm_xor_si128(b4c4, cd3);
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        //*/
+
+        /*
+        out1 = a3 ^ b2 ^ c1;
+        out2 = b3 ^ c2 ^ d1 ^ a4;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        i += 32;
+
+        /* 8-11 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
+        in2 ^= next2 ^ chorba8 ^ chorba2;
+        */
+
+        READ_NEXT(input, i, in1in2, in3in4);
+
+        __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
+        __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
+        in1in2 = _mm_xor_si128(in1in2, chorba80);
+        in1in2 = _mm_xor_si128(in1in2, chorba78);
+        in1in2 = _mm_xor_si128(in1in2, next12_chorba12);
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];*/
+        in3in4 = _mm_xor_si128(next34, in3in4);
+        in3in4 = _mm_xor_si128(in3in4, ab1);
+        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        in3in4 = _mm_xor_si128(in3in4, chorba34);
+        in3in4 = _mm_xor_si128(in3in4, a2_);
+
+        /*
+        in3 ^= next3 ^ a1 ^ chorba3;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(a4_, ab3);
+        next12 = _mm_xor_si128(next12, cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        i += 32;
+
+        /* 12-15 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        */
+        READ_NEXT(input, i, in1in2, in3in4);
+        in1in2 = _mm_xor_si128(in1in2, next12);
+        __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
+        in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
+        __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+        in1in2 = _mm_xor_si128(in1in2, chorb1_);
+
+
+        /*
+        in1 ^= next1 ^ chorba5 ^ chorba1;
+        in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+        */
+
+        in3in4 = _mm_xor_si128(next34, in3in4);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        /*
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        ///*
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        //*/
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        i += 32;
+
+        /* 16-19 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+        in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+        */
+        ///*
+        READ_NEXT(input, i, in1in2, in3in4);
+        __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+        in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
+        in1in2 = _mm_xor_si128(chorba1_, in1in2);
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+        //*/
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        */
+        ///*
+        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+        in3in4 = _mm_xor_si128(in3in4, next34);
+        //*/
+        /*
+        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+        */
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        /*
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        i += 32;
+
+        /* 20-23 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+        */
+
+        READ_NEXT(input, i, in1in2, in3in4);
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+        in1in2 = _mm_xor_si128(in1in2, chorba80);
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+        */
+        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+        in3in4 = _mm_xor_si128(in3in4, chorba12);
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        /*
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+        i += 32;
+
+        /* 24-27 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+        in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+        */
+
+        READ_NEXT(input, i, in1in2, in3in4);
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+        in1in2 = _mm_xor_si128(in1in2, chorba80);
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+        i += 32;
+
+        /* 28-31 */
+        /*
+        in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];
+        in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
+        in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
+        */
+        READ_NEXT(input, i, in1in2, in3in4);
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
+        in4 ^= next4 ^ a2 ^ b1 ^ chorba8;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
+        in3in4 = _mm_xor_si128(a2_, in3in4);
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        /*
+        out1 =      a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+        */
+
+        /*
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        a4_ = _mm_unpacklo_epi64(next56, ab4);
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        next12 = _mm_xor_si128(next12, b2c2);
+        next34 = _mm_xor_si128(b4c4, cd3);
+        next34 = _mm_xor_si128(next34, d2_);
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+    }
+
+    for(; (i + 40 + 32) < len; i += 32) {
+        __m128i in1in2, in3in4;
+
+        /*in1 = input[i / sizeof(uint64_t)];
+        in2 = input[i / sizeof(uint64_t) + 1];*/
+        //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4);
+        READ_NEXT(input, i, in1in2, in3in4);
+        in1in2 = _mm_xor_si128(in1in2, next12);
+
+        /*
+        in1 ^=next1;
+        in2 ^=next2;
+        */
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+        /*
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+        */
+
+        /*
+        in3 = input[i / sizeof(uint64_t) + 2];
+        in4 = input[i / sizeof(uint64_t) + 3];
+        in3 ^= next3 ^ a1;
+        in4 ^= next4 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+        */
+
+        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+        in3in4 = _mm_xor_si128(a2_, in3in4);
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        /*
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        */
+
+        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+        a4_ = _mm_xor_si128(b2c2, a4_);
+        next12 = _mm_xor_si128(ab3, a4_);
+        next12 = _mm_xor_si128(next12, cd1);
+
+        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+        next12 = _mm_xor_si128(next12, next56);
+        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+    }
+
+    next1 = _mm_cvtsi128_si64x(next12);
+    next2 = _mm_cvtsi128_si64x(_mm_unpackhi_epi64(next12, next12));
+    next3 = _mm_cvtsi128_si64x(next34);
+    next4 = _mm_cvtsi128_si64x(_mm_unpackhi_epi64(next34, next34));
+    next5 = _mm_cvtsi128_si64x(next56);
+
+    /* Skip the call to memcpy */
+    size_t copy_len = len - i;
+    __m128i *final128 = (__m128i*)final;
+    __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t));
+    while (copy_len >= 64) {
+        _mm_store_si128(final128++, _mm_load_si128(input128++));
+        _mm_store_si128(final128++, _mm_load_si128(input128++));
+        _mm_store_si128(final128++, _mm_load_si128(input128++));
+        _mm_store_si128(final128++, _mm_load_si128(input128++));
+        copy_len -= 64;
+    }
+
+    while (copy_len >= 16) {
+        _mm_store_si128(final128++, _mm_load_si128(input128++));
+        copy_len -= 16;
+    }
+
+    uint8_t *src_bytes = (uint8_t*)input128;
+    uint8_t *dst_bytes = (uint8_t*)final128;
+    while (copy_len--) {
+        *dst_bytes++ = *src_bytes++;
+    }
+
+    final[0] ^= next1;
+    final[1] ^= next2;
+    final[2] ^= next3;
+    final[3] ^= next4;
+    final[4] ^= next5;
+
+    /* We perform the same loop that crc32_braid_internal does, but skip
+     * the function call for this tiny tail */
+    uint8_t *final_bytes = (uint8_t*)final;
+    size_t rem = len - i;
+
+    while (rem--) {
+        crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
+    }
+
+    return crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+    uint32_t c;
+    uint64_t* aligned_buf;
+    size_t aligned_len;
+
+    c = (~crc) & 0xffffffff;
+    unsigned long algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
+    if (algn_diff < len) {
+        if (algn_diff) {
+            c = crc32_braid_internal(c, buf, algn_diff);
+        }
+        aligned_buf = (uint64_t*) (buf + algn_diff);
+        aligned_len = len - algn_diff;
+        if(aligned_len > CHORBA_LARGE_THRESHOLD) {
+            c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
+        } else if (aligned_len > 72) {
+            c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
+        } else {
+            c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
+        }
+    }
+    else {
+        c = crc32_braid_internal(c, buf, len);
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return c ^ 0xffffffff;
+}
+#endif
index fc62daeae15af6f1ef44b0d6731f8f7296b090b4..a8de8d9afcf08c8524762ff4a4634e9de4e1c806 100644 (file)
@@ -6,6 +6,14 @@
 #ifndef X86_FUNCTIONS_H_
 #define X86_FUNCTIONS_H_
 
+/* So great news, your compiler is broken and causes stack smashing. Rather than
+ * notching out its compilation we'll just remove the assignment in the functable.
+ * Further context:
+ * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
+#if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
+#define NO_CHORBA_SSE2
+#endif
+
 #ifdef X86_SSE2
 uint32_t chunksize_sse2(void);
 uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
@@ -17,6 +25,9 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsign
     void slide_hash_sse2(deflate_state *s);
 #  endif
     void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+#   if !defined(WITHOUT_CHORBA)
+    uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len);
+#   endif
 #endif
 
 #ifdef X86_SSSE3
@@ -70,7 +81,6 @@ uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
 uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #endif
 
-
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 // X86 - SSE2
 #  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
@@ -89,6 +99,10 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #      define native_longest_match longest_match_sse2
 #      undef native_longest_match_slow
 #      define native_longest_match_slow longest_match_slow_sse2
+#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+#          undef native_crc32
+#          define native_crc32 crc32_chorba_sse2
+#      endif
 #    endif
 #endif
 // X86 - SSSE3
index a2ec0027c31cbabb7af79be34f1c4f0c66b2de91..b27755834644dd5f88409beb782e8b7b42d6728e 100644 (file)
@@ -89,4 +89,40 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
 #  undef _mm512_extracti32x4_epi32
 #  define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
 #endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+/* For whatever reason this intrinsic is 64 bit only with MSVC?
+ * While we don't have 64 bit GPRs, it should at least be able to move it to stack
+ * or shuffle it over 2 registers */
+#if !defined(_M_AMD64)
+/* So, while we can't move directly to a GPR, hopefully this move to
+ * a stack resident variable doesn't equate to something awful */
+static inline int64_t _mm_cvtsi128_si64x(__m128i a) {
+    union { __m128i v; int64_t i; } u;
+    u.v = a;
+    return u.i;
+}
+
+static inline __m128i _mm_cvtsi64x_si128(int64_t a) {
+   return _mm_set_epi64x(0, a);
+}
+#endif
+#endif
+
+
+#if defined(__clang__)
+#define _mm_cvtsi64x_si128(v) _mm_cvtsi64_si128(v)
+#define _mm_cvtsi128_si64x(v) _mm_cvtsi128_si64(v)
+#endif
+
+#if defined(__GNUC__) && !defined( __x86_64__) && !defined(__clang__)
+static inline int64_t _mm_cvtsi128_si64x(__m128i a) {
+    union { __m128i v; int64_t i; } u;
+    u.v = a;
+    return u.i;
+}
+#define _mm_cvtsi64x_si128(a) _mm_set_epi64x(0, a)
+#endif
+
 #endif // include guard X86_INTRINS_H
index fe19641bb6724cb1817aba58cc283556e9956196..e29ccddd7e71fc732445b6680e9170e7ace7b0fe 100755 (executable)
--- a/configure
+++ b/configure
@@ -1642,8 +1642,8 @@ case "${ARCH}" in
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE2"
                 SFLAGS="${SFLAGS} -DX86_SSE2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o compare256_sse2.o slide_hash_sse2.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo compare256_sse2.lo slide_hash_sse2.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o chorba_sse2.o compare256_sse2.o slide_hash_sse2.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo chorba_sse2.lo compare256_sse2.lo slide_hash_sse2.lo"
 
                 if test $forcesse2 -eq 1; then
                     CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
diff --git a/crc32.h b/crc32.h
index e26b59e520c1c2b311a3dd3fddd70284ea708476..4c1eacaea660d27d2c7adad3e1a5103bf813fe68 100644 (file)
--- a/crc32.h
+++ b/crc32.h
@@ -15,6 +15,8 @@
 #define CHORBA_SMALL_THRESHOLD_64BIT 72
 #define CHORBA_SMALL_THRESHOLD_32BIT 80
 
+Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);
+
 typedef struct crc32_fold_s {
     uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
     uint32_t value;
index 4fc55318cc39afdfe4126e3fe8cd22780bbf96d3..aea7dbb3505d9d43d972adb6b05eae4107ab7a3c 100644 (file)
@@ -75,6 +75,9 @@ static void init_functable(void) {
     {
         ft.chunkmemset_safe = &chunkmemset_safe_sse2;
         ft.chunksize = &chunksize_sse2;
+#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+        ft.crc32 = &crc32_chorba_sse2;
+#endif
         ft.inflate_fast = &inflate_fast_sse2;
         ft.slide_hash = &slide_hash_sse2;
 #  ifdef HAVE_BUILTIN_CTZ
index 5c5751afc7dc1cbefc38fd73fb3419f7ee80e5e5..e51cff7bba8488ed8a8bccca637a7ab0187d6afd 100644 (file)
@@ -68,6 +68,12 @@ BENCHMARK_CRC32(braid, crc32_braid, 1);
 BENCHMARK_CRC32(native, native_crc32, 1);
 #else
 
+#ifndef WITHOUT_CHORBA
+#   if defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+    BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+#   endif
+#endif
+
 #ifdef ARM_CRC32
 BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32);
 #endif
index ee301ef6027b69aa27b38f63aaab1b6de86fa15e..f6aac12a9742fcb11ce7555fad0d715f5234d433 100644 (file)
@@ -281,5 +281,8 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 #ifdef X86_VPCLMULQDQ_CRC
 TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 #endif
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
+#endif
 
 #endif
index 72c0df765f4b65454c507438c30330733fcf90b3..4af3ad0dc8a3e34b64f7c5129ec0294447656b26 100644 (file)
@@ -60,6 +60,7 @@ OBJS = \
        chunkset_avx2.obj \
        chunkset_sse2.obj \
        chunkset_ssse3.obj \
+      chorba_sse2.obj \
        compare256_c.obj \
        compare256_avx2.obj \
        compare256_sse2.obj \
@@ -210,6 +211,7 @@ adler32_sse42.obj: $(TOP)/arch/x86/adler32_sse42.c $(TOP)/zbuild.h $(TOP)/adler3
 adler32_ssse3.obj: $(TOP)/arch/x86/adler32_ssse3.c $(TOP)/zbuild.h $(TOP)/adler32_p.h \
                    $(TOP)/arch/x86/adler32_ssse3_p.h
 adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h
+chorba_sse2.obj: $(TOP)/arch/x86/chorba_sse2.c $(TOP)/zbuild.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_p.h
 chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h
 chunkset_avx2.obj: $(TOP)/arch/x86/chunkset_avx2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h
 chunkset_sse2.obj: $(TOP)/arch/x86/chunkset_sse2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h