SSE4.1 optimized chorba
author    Adam Stylinski <kungfujesus06@gmail.com>
Tue, 11 Mar 2025 01:17:25 +0000 (21:17 -0400)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 15 Apr 2025 12:11:12 +0000 (14:11 +0200)
This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason
is that, while this approach incurs far fewer shifts, it has to manage an
entirely separate stack buffer that is the size of the L1 cache on most CPUs.
That buffer was one of the main reasons the 32k specialized function was slower
for the scalar counterpart despite auto-vectorizing: the auto-vectorized loop
was setting up the stack buffer at unaligned offsets, which is detrimental to
performance pre-Nehalem. Additionally, we were losing a fair bit of time to the
zero initialization, which we now do more selectively.
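
To make the alignment and zeroing point concrete, here is a minimal sketch
(helper names and offsets are placeholders, not the code from this commit):
keep the scratch buffer 16-byte aligned and clear only the windows that are
read before they are first written, using aligned stores throughout.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Zero n 16-byte lanes with movdqa; dst must be 16-byte aligned. */
static void zero_lanes(__m128i *dst, size_t n) {
    const __m128i z = _mm_setzero_si128();
    for (size_t i = 0; i < n; i++)
        _mm_store_si128(dst + i, z);
}

void scratch_setup_example(void) {
    /* 32 KiB scratch buffer, explicitly 16-byte aligned, instead of
     * "uint64_t bitbuffer[4096] = {0};", which zero-fills the whole thing
     * and may do so at unaligned offsets when auto-vectorized. */
    _Alignas(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    __m128i *v = (__m128i *)bitbuffer;

    /* Clear only the ranges consumed before they are first written;
     * these two windows are illustrative, not the commit's exact offsets. */
    zero_lanes(v + 64, 8);
    zero_lanes(v + 80, 11);
}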

There are a ton of loads and stores happening, and we are almost certainly
bound on the fill buffers and store forwarding. An SSE2 version of this code is
probably possible by simply replacing the shifts with unpacks against zero and
the palignrs with shufpds; I'm just not sure it would be worth it. We gate on
SSE4.1 not because we use a specific SSE4.1 instruction, but because that
feature level marks when Wolfdale came out and palignr became a lot faster.
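
For reference, a small sketch of the SSE2 fallback hinted at above (helper
names assumed, not part of this commit): the 8-byte palignr realignment maps
onto shufpd, and the byte shifts at the edges map onto unpacks against zero.

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3, for the palignr reference version */

/* SSSE3 path: concatenate hi:lo and shift right by 8 bytes. */
static inline __m128i align8_palignr(__m128i lo, __m128i hi) {
    return _mm_alignr_epi8(hi, lo, 8);
}

/* SSE2-only equivalent: shufpd with imm 1 picks the high qword of lo
 * and the low qword of hi, the same result as the palignr above. */
static inline __m128i align8_sse2(__m128i lo, __m128i hi) {
    return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo),
                                           _mm_castsi128_pd(hi), 1));
}

/* The edge shifts become unpacks against zero:
 *   _mm_slli_si128(x, 8) == _mm_unpacklo_epi64(_mm_setzero_si128(), x)
 *   _mm_srli_si128(x, 8) == _mm_unpackhi_epi64(x, _mm_setzero_si128()) */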

CMakeLists.txt
arch/x86/Makefile.in
arch/x86/chorba_sse41.c [new file with mode: 0644]
arch/x86/x86_features.c
arch/x86/x86_features.h
arch/x86/x86_functions.h
cmake/detect-intrinsics.cmake
configure
functable.c
test/benchmarks/benchmark_crc32.cc
test/test_crc32.cc

index df137738a764c989b63636ccb48c78ae4fe6ba6e..2324ecdad9f6c6bdf057d38d63431afa8bf58314 100644 (file)
@@ -131,7 +131,8 @@ elseif(BASEARCH_S360_FOUND)
 elseif(BASEARCH_X86_FOUND)
     option(WITH_SSE2 "Build with SSE2" ON)
     cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF)
-    cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF)
+    cmake_dependent_option(WITH_SSE41 "Build with SSE41" ON "WITH_SSSE3" OFF)
+    cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSE41" OFF)
     cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF)
     cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
     cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
@@ -151,7 +152,7 @@ mark_as_advanced(FORCE
     WITH_DFLTCC_INFLATE
     WITH_CRC32_VX
     WITH_AVX2 WITH_SSE2
-    WITH_SSSE3 WITH_SSE42
+    WITH_SSSE3 WITH_SSE41 WITH_SSE42
     WITH_PCLMULQDQ
     WITH_ALTIVEC
     WITH_POWER8
@@ -1035,9 +1036,20 @@ if(WITH_OPTIM)
                 set(WITH_SSSE3 OFF)
             endif()
         endif()
+        if(WITH_SSE41)
+            check_sse41_intrinsics()
+            if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
+                add_definitions(-DX86_SSE41)
+                set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
+                list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
+                set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
+            else()
+                set(WITH_SSE41 OFF)
+            endif()
+        endif()
         if(WITH_SSE42)
             check_sse42_intrinsics()
-            if(HAVE_SSE42_INTRIN AND WITH_SSSE3)
+            if(HAVE_SSE42_INTRIN AND WITH_SSE41)
                 add_definitions(-DX86_SSE42)
                 set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
                 add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
@@ -1526,6 +1538,7 @@ elseif(BASEARCH_X86_FOUND)
     add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
     add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
     add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
+    add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
     add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
     add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
     add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
index 3b00c3ed8f6e5ff4d73f9afec777937d091080a9..53f5325928fdd1b6425a8560e2c472448d62bea3 100644 (file)
@@ -13,6 +13,7 @@ AVX512VNNIFLAG=-mavx512vnni -mbmi2
 AVX2FLAG=-mavx2 -mbmi2
 SSE2FLAG=-msse2
 SSSE3FLAG=-mssse3
+SSE41FLAG=-msse4.1
 SSE42FLAG=-msse4.2
 PCLMULFLAG=-mpclmul
 VPCLMULFLAG=-mvpclmulqdq
@@ -35,6 +36,7 @@ all: \
        chunkset_sse2.o chunkset_sse2.lo \
        chunkset_ssse3.o chunkset_ssse3.lo \
        chorba_sse2.o chorba_sse2.lo \
+       chorba_sse41.o chorba_sse41.lo \
        compare256_avx2.o compare256_avx2.lo \
        compare256_avx512.o compare256_avx512.lo \
        compare256_sse2.o compare256_sse2.lo \
@@ -79,6 +81,12 @@ chorba_sse2.o:
 chorba_sse2.lo:
        $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
 
+chorba_sse41.o:
+       $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
+
+chorba_sse41.lo:
+       $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
+
 compare256_avx2.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
diff --git a/arch/x86/chorba_sse41.c b/arch/x86/chorba_sse41.c
new file mode 100644 (file)
index 0000000..5e03ed1
--- /dev/null
@@ -0,0 +1,342 @@
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch/generic/generic_functions.h"
+#include <assert.h>
+
+extern uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
+extern uint32_t chorba_small_nondestructive_sse2(uint32_t c, uint64_t *aligned_buf, size_t aligned_len);
+
+#define READ_NEXT(in, off, a, b) do { \
+        a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+        b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+        } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+        a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+        b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+        c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+        d  = _mm_srli_epi64(invec, 20); \
+        } while (0);
+
+#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
+        out0 = _mm_slli_si128(in0, shift); \
+        out1 = _mm_alignr_epi8(in1, in0, shift); \
+        out2 = _mm_alignr_epi8(in2, in1, shift); \
+        out3 = _mm_alignr_epi8(in3, in2, shift); \
+        out4 = _mm_srli_si128(in3, shift); \
+        } while (0)
+
+#define STORE4(out0, out1, out2, out3, out) do { \
+        _mm_store_si128(out++, out0); \
+        _mm_store_si128(out++, out1); \
+        _mm_store_si128(out++, out2); \
+        _mm_store_si128(out++, out3); \
+    } while (0)
+
+#define READ4(out0, out1, out2, out3, in) do { \
+    out0 = _mm_load_si128(in++); \
+    out1 = _mm_load_si128(in++); \
+    out2 = _mm_load_si128(in++); \
+    out3 = _mm_load_si128(in++); \
+    } while (0)
+
+/* This is intentionally shifted one down to compensate for the deferred store from
+ * the last iteration */
+#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
+    out0 = _mm_xor_si128(in[1], xor0); \
+    out1 = _mm_xor_si128(in[2], xor1); \
+    out2 = _mm_xor_si128(in[3], xor2); \
+    out3 = _mm_xor_si128(in[4], xor3); \
+    } while (0)
+
+static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
+    const uint64_t* input = buf;
+    ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+    __m128i *bitbuffer_v = (__m128i*)bitbuffer;
+    const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
+    __m128i z = _mm_setzero_si128();
+
+    __m128i *bitbuf128 = &bitbuffer_v[64];
+    __m128i *bitbuf144 = &bitbuffer_v[72];
+    __m128i *bitbuf182 = &bitbuffer_v[91];
+    __m128i *bitbuf210 = &bitbuffer_v[105];
+    __m128i *bitbuf300 = &bitbuffer_v[150];
+    __m128i *bitbuf0 = bitbuf128;
+    __m128i *inptr = (__m128i*)input;
+
+    /* We only need to zero out the bytes between the 128'th value and the 144th
+     * that are actually read */
+    __m128i *z_cursor = bitbuf128;
+    for (size_t i = 0; i < 2; ++i) {
+        STORE4(z, z, z, z, z_cursor);
+    }
+
+    /* We only need to zero out the bytes between the 144'th value and the 182nd that
+     * are actually read */
+    z_cursor = bitbuf144 + 8;
+    for (size_t i = 0; i < 11; ++i) {
+        _mm_store_si128(z_cursor++, z);
+    }
+
+    /* We only need to zero out the bytes between the 182nd value and the 210th that
+     * are actually read. */
+    z_cursor = bitbuf182;
+    for (size_t i = 0; i < 4; ++i) {
+        STORE4(z, z, z, z, z_cursor);
+    }
+
+    /* We need to mix this in */
+    __m128i init_crc = _mm_cvtsi64x_si128(crc);
+    crc = 0;
+
+    size_t i = 0;
+
+    /* Previous iteration runs carried over */
+    __m128i buf144 = z;
+    __m128i buf182 = z;
+    __m128i buf210 = z;
+
+    for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+
+        READ4(in12, in34, in56, in78, inptr);
+
+        if (i == 0) {
+            in12 = _mm_xor_si128(in12, init_crc);
+        }
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a = _mm_xor_si128(buf144, in_1);
+
+        STORE4(a, in23, in45, in67, bitbuf144);
+        buf144 = in8_;
+
+        __m128i e = _mm_xor_si128(buf182, in_1);
+        STORE4(e, in23, in45, in67, bitbuf182);
+        buf182 = in8_;
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+        READ4(in12, in34, in56, in78, inptr);
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a = _mm_xor_si128(buf144, in_1);
+
+        STORE4(a, in23, in45, in67, bitbuf144);
+        buf144 = in8_;
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+        READ4(in12, in34, in56, in78, inptr);
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a, b, c, d;
+        a = _mm_xor_si128(buf144, in_1);
+        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+        STORE4(a, b, c, d, bitbuf144);
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+
+        if (i < 128 * 8) {
+            READ4(in12, in34, in56, in78, inptr);
+        } else {
+            in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+        }
+
+        // [0, 145, 183, 211]
+
+        /* Pre Penryn CPUs the unpack should be faster */
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a, b, c, d;
+        a = _mm_xor_si128(buf144, in_1);
+        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+        STORE4(a, b, c, d, bitbuf144);
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i n, o, p;
+        __m128i m = _mm_xor_si128(buf210, in_1);
+
+        /* Couldn't tell you why but despite knowing that this is always false,
+         * removing this branch with GCC makes things significantly slower. Some
+         * loop bodies must be being joined or something */
+        if (i < 84 * 8) {
+            n = in23;
+            o = in45;
+            p = in67;
+            buf210 = in8_;
+        } else {
+            READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
+        }
+
+        STORE4(m, n, o, p, bitbuf210);
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    /* Second half of stores bubbled out */
+    _mm_store_si128(bitbuf144, buf144);
+    _mm_store_si128(bitbuf182, buf182);
+    _mm_store_si128(bitbuf210, buf210);
+
+    /* We also have to zero out the tail */
+    size_t left_to_z = len - (300*8 + i);
+    __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
+    while (left_to_z >= 64) {
+       STORE4(z, z, z, z, bitbuf_tail);
+       left_to_z -= 64;
+    }
+
+    while (left_to_z >= 16) {
+       _mm_store_si128(bitbuf_tail++, z);
+       left_to_z -= 16;
+    }
+
+    uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
+    while (left_to_z--) {
+       *tail_bytes++ = 0;
+    }
+
+    ALIGNED_(16) uint64_t final[9] = {0};
+    __m128i next12, next34, next56;
+    next12 = z;
+    next34 = z;
+    next56 = z;
+
+    for(; (i + 72 < len); i += 32) {
+        __m128i in1in2, in3in4;
+        __m128i in1in2_, in3in4_;
+        __m128i ab1, ab2, ab3, ab4;
+        __m128i cd1, cd2, cd3, cd4;
+
+        READ_NEXT(input, i, in1in2, in3in4);
+        READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
+
+        in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
+        in3in4 = _mm_xor_si128(in3in4, in3in4_);
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        __m128i a2_ = _mm_slli_si128(ab2, 8);
+        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+        in3in4 = _mm_xor_si128(a2_, in3in4);
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
+        __m128i a4_ = _mm_slli_si128(ab4, 8);
+        a4_ = _mm_xor_si128(b2c2, a4_);
+        next12 = _mm_xor_si128(ab3, a4_);
+        next12 = _mm_xor_si128(next12, cd1);
+
+        __m128i d2_ = _mm_srli_si128(cd2, 8);
+        __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
+        next12 = _mm_xor_si128(next12, next56);
+        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+        next56 = _mm_srli_si128(cd4, 8);
+    }
+
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    __m128i *final128 = (__m128i*)final;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
+    ++final128;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
+    ++final128;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
+
+    uint8_t* final_bytes = (uint8_t*) final;
+
+    for(size_t j = 0; j < (len-i); j++) {
+        crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
+    }
+    return crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+    uint32_t c;
+    uint64_t* aligned_buf;
+    size_t aligned_len;
+
+    c = (~crc) & 0xffffffff;
+    uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
+    if (algn_diff < len) {
+        if (algn_diff) {
+            c = crc32_braid_internal(c, buf, algn_diff);
+        }
+        aligned_buf = (uint64_t*) (buf + algn_diff);
+        aligned_len = len - algn_diff;
+        if(aligned_len > CHORBA_LARGE_THRESHOLD) {
+            c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
+        } else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
+                   aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
+            c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
+        } else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
+            c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
+        } else {
+            c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
+        }
+    }
+    else {
+        c = crc32_braid_internal(c, buf, len);
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return c ^ 0xffffffff;
+}
+#endif
index 806e99a6f62b02f5d07e3da853a083597662633b..d2d25fb7cda3ffefeaf582c4cd20ac8fa4b5f45c 100644 (file)
@@ -85,6 +85,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
 
     features->has_sse2 = edx & 0x4000000;
     features->has_ssse3 = ecx & 0x200;
+    features->has_sse41 = ecx & 0x80000;
     features->has_sse42 = ecx & 0x100000;
     features->has_pclmulqdq = ecx & 0x2;
 
index 3901ad75becc2b928827010ad9c55e1ccaf3d7ff..2118b8e87ac4bc005a7a56da215ed19131d1b8f4 100644 (file)
@@ -17,6 +17,7 @@ struct x86_cpu_features {
     int has_bmi2;
     int has_sse2;
     int has_ssse3;
+    int has_sse41;
     int has_sse42;
     int has_pclmulqdq;
     int has_vpclmulqdq;
index 8e1943a7d2f2557396aad6a4185d0985d5fd6658..5e13ffe01b35eb25aff1f68c26e936bb8ed2ed94 100644 (file)
@@ -11,7 +11,7 @@
  * Further context:
  * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
 #if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
-#define NO_CHORBA_SSE2
+#define NO_CHORBA_SSE
 #endif
 
 #ifdef X86_SSE2
@@ -36,6 +36,12 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig
 void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
 #endif
 
+#ifdef X86_SSE41
+#   if !defined(WITHOUT_CHORBA)
+    uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
+#   endif
+#endif
+
 #ifdef X86_SSE42
 uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
@@ -104,7 +110,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #      define native_longest_match longest_match_sse2
 #      undef native_longest_match_slow
 #      define native_longest_match_slow longest_match_slow_sse2
-#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
 #          undef native_crc32
 #          define native_crc32 crc32_chorba_sse2
 #      endif
@@ -119,6 +125,10 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #    undef native_inflate_fast
 #    define native_inflate_fast inflate_fast_ssse3
 #  endif
+#  if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
+#   undef native_crc32
+#   define native_crc32 crc32_chorba_sse41
+#   endif
 // X86 - SSE4.2
 #  if defined(X86_SSE42) && defined(__SSE4_2__)
 #    undef native_adler32_fold_copy
index cf0ee5268d9cb39224cf96a5eb525d7061064ab3..66872766d92a6289a95648e0d225ef566ea949ae 100644 (file)
@@ -565,6 +565,29 @@ macro(check_ssse3_intrinsics)
     )
 endmacro()
 
+macro(check_sse41_intrinsics)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(SSE41FLAG "-msse4.1")
+            else()
+                set(SSE41FLAG "/arch:SSE4.1")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(SSE41FLAG "-msse4.1")
+        endif()
+    endif()
+    # Check whether compiler supports SSE4.1 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <smmintrin.h>
+        __m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
+        int main(void) { return 0; }"
+        HAVE_SSE41_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
 macro(check_sse42_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
index 05b7185811b89b4076b905fc16c056f02501172f..f138118c106fc2876d59aeafdab3cc07fdd0ca12 100755 (executable)
--- a/configure
+++ b/configure
@@ -111,6 +111,7 @@ avx512vnniflag="${avx512flag} -mavx512vnni"
 avx2flag="-mavx2 -mbmi2"
 sse2flag="-msse2"
 ssse3flag="-mssse3"
+sse41flag="-msse4.1"
 sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
 vpclmulflag="-mvpclmulqdq -mavx512f"
@@ -1590,6 +1591,22 @@ EOF
     fi
 }
 
+check_sse41_intrinsics() {
+    # Check whether compiler supports SSE4.1 intrinsics
+    cat > $test.c << EOF
+#include <smmintrin.h>
+__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
+int main(void) { return 0; }
+EOF
+    if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
+        echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
+        HAVE_SSE41_INTRIN=1
+    else
+        echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
+        HAVE_SSE41_INTRIN=0
+    fi
+}
+
 check_sse42_intrinsics() {
     # Check whether compiler supports SSE4.2 intrinsics
     cat > $test.c << EOF
@@ -1717,6 +1734,15 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
             fi
 
+            check_sse41_intrinsics
+
+            if test ${HAVE_SSE41_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE41"
+                SFLAGS="${SFLAGS} -DX86_SSE41"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chorba_sse41.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chorba_sse41.lo"
+            fi
+
             check_sse42_intrinsics
 
             if test ${HAVE_SSE42_INTRIN} -eq 1; then
@@ -2263,6 +2289,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
 /^SSE2FLAG *=/s#=.*#=$sse2flag#
 /^SSSE3FLAG *=/s#=.*#=$ssse3flag#
+/^SSE41FLAG *=/s#=.*#=$sse41flag#
 /^SSE42FLAG *=/s#=.*#=$sse42flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
 /^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
index a7c1bd23ccd7f7c83dacdfa441cc47c9d1fed506..1d38637fc90a0f2d57ca33b460eadf6ee1fa37bb 100644 (file)
@@ -75,7 +75,7 @@ static void init_functable(void) {
     {
         ft.chunkmemset_safe = &chunkmemset_safe_sse2;
         ft.chunksize = &chunksize_sse2;
-#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
         ft.crc32 = &crc32_chorba_sse2;
 #endif
         ft.inflate_fast = &inflate_fast_sse2;
@@ -95,6 +95,16 @@ static void init_functable(void) {
         ft.inflate_fast = &inflate_fast_ssse3;
     }
 #endif
+
+    // X86 - SSE4.1
+#ifdef X86_SSE41
+    if (cf.x86.has_sse41) {
+#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
+        ft.crc32 = &crc32_chorba_sse41;
+#endif
+    }
+#endif
+
     // X86 - SSE4.2
 #ifdef X86_SSE42
     if (cf.x86.has_sse42) {
index e51cff7bba8488ed8a8bccca637a7ab0187d6afd..e6947715ff6aa7b0c5d1284c32ed571776487ac4 100644 (file)
@@ -69,8 +69,11 @@ BENCHMARK_CRC32(native, native_crc32, 1);
 #else
 
 #ifndef WITHOUT_CHORBA
-#   if defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+#   if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
     BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+#       if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
+        BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
+#       endif
 #   endif
 #endif
 
index f6aac12a9742fcb11ce7555fad0d715f5234d433..56667f0283298869a9f2a418497e0a351ddd5017 100644 (file)
@@ -193,7 +193,8 @@ static const crc32_test tests[] = {
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&", 600, 0x888AFA5B},
-  {0x0, buf32k, 32768, 0x217726B2}
+  {0x0, buf32k, 32768, 0x217726B2},
+  {0x0, buf32k, 16384, 0xE81722F0}
 };
 
 class crc32_variant : public ::testing::TestWithParam<crc32_test> {
@@ -281,8 +282,11 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 #ifdef X86_VPCLMULQDQ_CRC
 TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 #endif
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
 TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
 #endif
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
+TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
+#endif
 
 #endif