SSE4.1 optimized chorba
author    Adam Stylinski <kungfujesus06@gmail.com>
Tue, 11 Mar 2025 01:17:25 +0000 (21:17 -0400)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 15 Apr 2025 12:11:12 +0000 (14:11 +0200)
This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason
is that, while this approach incurs far fewer shifts, it has to manage an
entirely separate stack buffer that is the size of the L1 cache on most CPUs.
That buffer was one of the main reasons the 32k specialized function was slower
for the scalar counterpart despite auto-vectorizing: the auto-vectorized loop
was setting up the stack buffer at unaligned offsets, which is detrimental to
performance pre-Nehalem. Additionally, we were losing a fair bit of time to the
zero initialization, which we now do more selectively.
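
To make the alignment and zeroing point concrete, here is a minimal sketch
(helper names and offsets are placeholders, not the code from this commit):
keep the scratch buffer 16-byte aligned and clear only the windows that are
read before they are first written, using aligned stores throughout.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Zero n 16-byte lanes with movdqa; dst must be 16-byte aligned. */
static void zero_lanes(__m128i *dst, size_t n) {
    const __m128i z = _mm_setzero_si128();
    for (size_t i = 0; i < n; i++)
        _mm_store_si128(dst + i, z);
}

void scratch_setup_example(void) {
    /* 32 KiB scratch buffer, explicitly 16-byte aligned, instead of
     * "uint64_t bitbuffer[4096] = {0};", which zero-fills the whole thing
     * and may do so at unaligned offsets when auto-vectorized. */
    _Alignas(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    __m128i *v = (__m128i *)bitbuffer;

    /* Clear only the ranges consumed before they are first written;
     * these two windows are illustrative, not the commit's exact offsets. */
    zero_lanes(v + 64, 8);
    zero_lanes(v + 80, 11);
}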

There are a ton of loads and stores happening, and we are almost certainly
bound on the fill buffers and store forwarding. An SSE2 version of this code is
probably possible by simply replacing the shifts with unpacks against zero and
the palignrs with shufpds; I'm just not sure it would be worth it. We gate on
SSE4.1 not because we use a specific SSE4.1 instruction, but because that
feature level marks when Wolfdale came out and palignr became a lot faster.
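
For reference, a small sketch of the SSE2 fallback hinted at above (helper
names assumed, not part of this commit): the 8-byte palignr realignment maps
onto shufpd, and the byte shifts at the edges map onto unpacks against zero.

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3, for the palignr reference version */

/* SSSE3 path: concatenate hi:lo and shift right by 8 bytes. */
static inline __m128i align8_palignr(__m128i lo, __m128i hi) {
    return _mm_alignr_epi8(hi, lo, 8);
}

/* SSE2-only equivalent: shufpd with imm 1 picks the high qword of lo
 * and the low qword of hi, the same result as the palignr above. */
static inline __m128i align8_sse2(__m128i lo, __m128i hi) {
    return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo),
                                           _mm_castsi128_pd(hi), 1));
}

/* The edge shifts become unpacks against zero:
 *   _mm_slli_si128(x, 8) == _mm_unpacklo_epi64(_mm_setzero_si128(), x)
 *   _mm_srli_si128(x, 8) == _mm_unpackhi_epi64(x, _mm_setzero_si128()) */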

CMakeLists.txt
arch/x86/Makefile.in
arch/x86/chorba_sse41.c [new file with mode: 0644]
arch/x86/x86_features.c
arch/x86/x86_features.h
arch/x86/x86_functions.h
cmake/detect-intrinsics.cmake
configure
functable.c
test/benchmarks/benchmark_crc32.cc
test/test_crc32.cc

index df137738a764c989b63636ccb48c78ae4fe6ba6e..2324ecdad9f6c6bdf057d38d63431afa8bf58314 100644 (file)
@@ -131,7 +131,8 @@ elseif(BASEARCH_S360_FOUND)
 elseif(BASEARCH_X86_FOUND)
     option(WITH_SSE2 "Build with SSE2" ON)
     cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF)
-    cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF)
+    cmake_dependent_option(WITH_SSE41 "Build with SSE41" ON "WITH_SSSE3" OFF)
+    cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSE41" OFF)
     cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF)
     cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
     cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
@@ -151,7 +152,7 @@ mark_as_advanced(FORCE
     WITH_DFLTCC_INFLATE
     WITH_CRC32_VX
     WITH_AVX2 WITH_SSE2
-    WITH_SSSE3 WITH_SSE42
+    WITH_SSSE3 WITH_SSE41 WITH_SSE42
     WITH_PCLMULQDQ
     WITH_ALTIVEC
     WITH_POWER8
@@ -1035,9 +1036,20 @@ if(WITH_OPTIM)
                 set(WITH_SSSE3 OFF)
             endif()
         endif()
+        if(WITH_SSE41)
+            check_sse41_intrinsics()
+            if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
+                add_definitions(-DX86_SSE41)
+                set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
+                list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
+                set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
+            else()
+                set(WITH_SSE41 OFF)
+            endif()
+        endif()
         if(WITH_SSE42)
             check_sse42_intrinsics()
-            if(HAVE_SSE42_INTRIN AND WITH_SSSE3)
+            if(HAVE_SSE42_INTRIN AND WITH_SSE41)
                 add_definitions(-DX86_SSE42)
                 set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
                 add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
@@ -1526,6 +1538,7 @@ elseif(BASEARCH_X86_FOUND)
     add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
     add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
     add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
+    add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
     add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
     add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
     add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
index 3b00c3ed8f6e5ff4d73f9afec777937d091080a9..53f5325928fdd1b6425a8560e2c472448d62bea3 100644 (file)
@@ -13,6 +13,7 @@ AVX512VNNIFLAG=-mavx512vnni -mbmi2
 AVX2FLAG=-mavx2 -mbmi2
 SSE2FLAG=-msse2
 SSSE3FLAG=-mssse3
+SSE41FLAG=-msse4.1
 SSE42FLAG=-msse4.2
 PCLMULFLAG=-mpclmul
 VPCLMULFLAG=-mvpclmulqdq
@@ -35,6 +36,7 @@ all: \
        chunkset_sse2.o chunkset_sse2.lo \
        chunkset_ssse3.o chunkset_ssse3.lo \
        chorba_sse2.o chorba_sse2.lo \
+       chorba_sse41.o chorba_sse41.lo \
        compare256_avx2.o compare256_avx2.lo \
        compare256_avx512.o compare256_avx512.lo \
        compare256_sse2.o compare256_sse2.lo \
@@ -79,6 +81,12 @@ chorba_sse2.o:
 chorba_sse2.lo:
        $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
 
+chorba_sse41.o:
+       $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
+
+chorba_sse41.lo:
+       $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
+
 compare256_avx2.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
 
diff --git a/arch/x86/chorba_sse41.c b/arch/x86/chorba_sse41.c
new file mode 100644 (file)
index 0000000..5e03ed1
--- /dev/null
@@ -0,0 +1,342 @@
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch/generic/generic_functions.h"
+#include <assert.h>
+
+extern uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
+extern uint32_t chorba_small_nondestructive_sse2(uint32_t c, uint64_t *aligned_buf, size_t aligned_len);
+
+#define READ_NEXT(in, off, a, b) do { \
+        a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+        b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+        } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+        a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+        b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+        c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+        d  = _mm_srli_epi64(invec, 20); \
+        } while (0);
+
+#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
+        out0 = _mm_slli_si128(in0, shift); \
+        out1 = _mm_alignr_epi8(in1, in0, shift); \
+        out2 = _mm_alignr_epi8(in2, in1, shift); \
+        out3 = _mm_alignr_epi8(in3, in2, shift); \
+        out4 = _mm_srli_si128(in3, shift); \
+        } while (0)
+
+#define STORE4(out0, out1, out2, out3, out) do { \
+        _mm_store_si128(out++, out0); \
+        _mm_store_si128(out++, out1); \
+        _mm_store_si128(out++, out2); \
+        _mm_store_si128(out++, out3); \
+    } while (0)
+
+#define READ4(out0, out1, out2, out3, in) do { \
+    out0 = _mm_load_si128(in++); \
+    out1 = _mm_load_si128(in++); \
+    out2 = _mm_load_si128(in++); \
+    out3 = _mm_load_si128(in++); \
+    } while (0)
+
+/* This is intentionally shifted one down to compensate for the deferred store from
+ * the last iteration */
+#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
+    out0 = _mm_xor_si128(in[1], xor0); \
+    out1 = _mm_xor_si128(in[2], xor1); \
+    out2 = _mm_xor_si128(in[3], xor2); \
+    out3 = _mm_xor_si128(in[4], xor3); \
+    } while (0)
+
+static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
+    const uint64_t* input = buf;
+    ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+    __m128i *bitbuffer_v = (__m128i*)bitbuffer;
+    const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
+    __m128i z = _mm_setzero_si128();
+
+    __m128i *bitbuf128 = &bitbuffer_v[64];
+    __m128i *bitbuf144 = &bitbuffer_v[72];
+    __m128i *bitbuf182 = &bitbuffer_v[91];
+    __m128i *bitbuf210 = &bitbuffer_v[105];
+    __m128i *bitbuf300 = &bitbuffer_v[150];
+    __m128i *bitbuf0 = bitbuf128;
+    __m128i *inptr = (__m128i*)input;
+
+    /* We only need to zero out the bytes between the 128'th value and the 144th
+     * that are actually read */
+    __m128i *z_cursor = bitbuf128;
+    for (size_t i = 0; i < 2; ++i) {
+        STORE4(z, z, z, z, z_cursor);
+    }
+
+    /* We only need to zero out the bytes between the 144'th value and the 182nd that
+     * are actually read */
+    z_cursor = bitbuf144 + 8;
+    for (size_t i = 0; i < 11; ++i) {
+        _mm_store_si128(z_cursor++, z);
+    }
+
+    /* We only need to zero out the bytes between the 182nd value and the 210th that
+     * are actually read. */
+    z_cursor = bitbuf182;
+    for (size_t i = 0; i < 4; ++i) {
+        STORE4(z, z, z, z, z_cursor);
+    }
+
+    /* We need to mix this in */
+    __m128i init_crc = _mm_cvtsi64x_si128(crc);
+    crc = 0;
+
+    size_t i = 0;
+
+    /* Previous iteration runs carried over */
+    __m128i buf144 = z;
+    __m128i buf182 = z;
+    __m128i buf210 = z;
+
+    for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+
+        READ4(in12, in34, in56, in78, inptr);
+
+        if (i == 0) {
+            in12 = _mm_xor_si128(in12, init_crc);
+        }
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a = _mm_xor_si128(buf144, in_1);
+
+        STORE4(a, in23, in45, in67, bitbuf144);
+        buf144 = in8_;
+
+        __m128i e = _mm_xor_si128(buf182, in_1);
+        STORE4(e, in23, in45, in67, bitbuf182);
+        buf182 = in8_;
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+        READ4(in12, in34, in56, in78, inptr);
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a = _mm_xor_si128(buf144, in_1);
+
+        STORE4(a, in23, in45, in67, bitbuf144);
+        buf144 = in8_;
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+        READ4(in12, in34, in56, in78, inptr);
+
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a, b, c, d;
+        a = _mm_xor_si128(buf144, in_1);
+        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+        STORE4(a, b, c, d, bitbuf144);
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i m = _mm_xor_si128(buf210, in_1);
+        STORE4(m, in23, in45, in67, bitbuf210);
+        buf210 = in8_;
+
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    for(; i + 300*8+64 < len; i += 64) {
+        __m128i in12, in34, in56, in78,
+                in_1, in23, in45, in67, in8_;
+
+        if (i < 128 * 8) {
+            READ4(in12, in34, in56, in78, inptr);
+        } else {
+            in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+            in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+        }
+
+        // [0, 145, 183, 211]
+
+        /* Pre Penryn CPUs the unpack should be faster */
+        REALIGN_CHORBA(in12, in34, in56, in78,
+                       in_1, in23, in45, in67, in8_, 8);
+
+        __m128i a, b, c, d;
+        a = _mm_xor_si128(buf144, in_1);
+        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+        STORE4(a, b, c, d, bitbuf144);
+
+        __m128i e, f, g, h;
+        e = _mm_xor_si128(buf182, in_1);
+        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+        STORE4(e, f, g, h, bitbuf182);
+
+        __m128i n, o, p;
+        __m128i m = _mm_xor_si128(buf210, in_1);
+
+        /* Couldn't tell you why but despite knowing that this is always false,
+         * removing this branch with GCC makes things significantly slower. Some
+         * loop bodies must be being joined or something */
+        if (i < 84 * 8) {
+            n = in23;
+            o = in45;
+            p = in67;
+            buf210 = in8_;
+        } else {
+            READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
+        }
+
+        STORE4(m, n, o, p, bitbuf210);
+        STORE4(in12, in34, in56, in78, bitbuf300);
+    }
+
+    /* Second half of stores bubbled out */
+    _mm_store_si128(bitbuf144, buf144);
+    _mm_store_si128(bitbuf182, buf182);
+    _mm_store_si128(bitbuf210, buf210);
+
+    /* We also have to zero out the tail */
+    size_t left_to_z = len - (300*8 + i);
+    __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
+    while (left_to_z >= 64) {
+       STORE4(z, z, z, z, bitbuf_tail);
+       left_to_z -= 64;
+    }
+
+    while (left_to_z >= 16) {
+       _mm_store_si128(bitbuf_tail++, z);
+       left_to_z -= 16;
+    }
+
+    uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
+    while (left_to_z--) {
+       *tail_bytes++ = 0;
+    }
+
+    ALIGNED_(16) uint64_t final[9] = {0};
+    __m128i next12, next34, next56;
+    next12 = z;
+    next34 = z;
+    next56 = z;
+
+    for(; (i + 72 < len); i += 32) {
+        __m128i in1in2, in3in4;
+        __m128i in1in2_, in3in4_;
+        __m128i ab1, ab2, ab3, ab4;
+        __m128i cd1, cd2, cd3, cd4;
+
+        READ_NEXT(input, i, in1in2, in3in4);
+        READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
+
+        in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
+        in3in4 = _mm_xor_si128(in3in4, in3in4_);
+
+        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+        __m128i a2_ = _mm_slli_si128(ab2, 8);
+        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+        in3in4 = _mm_xor_si128(a2_, in3in4);
+        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+        __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
+        __m128i a4_ = _mm_slli_si128(ab4, 8);
+        a4_ = _mm_xor_si128(b2c2, a4_);
+        next12 = _mm_xor_si128(ab3, a4_);
+        next12 = _mm_xor_si128(next12, cd1);
+
+        __m128i d2_ = _mm_srli_si128(cd2, 8);
+        __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
+        next12 = _mm_xor_si128(next12, next56);
+        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+        next56 = _mm_srli_si128(cd4, 8);
+    }
+
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    __m128i *final128 = (__m128i*)final;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
+    ++final128;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
+    ++final128;
+    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
+
+    uint8_t* final_bytes = (uint8_t*) final;
+
+    for(size_t j = 0; j < (len-i); j++) {
+        crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
+    }
+    return crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+    uint32_t c;
+    uint64_t* aligned_buf;
+    size_t aligned_len;
+
+    c = (~crc) & 0xffffffff;
+    uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
+    if (algn_diff < len) {
+        if (algn_diff) {
+            c = crc32_braid_internal(c, buf, algn_diff);
+        }
+        aligned_buf = (uint64_t*) (buf + algn_diff);
+        aligned_len = len - algn_diff;
+        if(aligned_len > CHORBA_LARGE_THRESHOLD) {
+            c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
+        } else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
+                   aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
+            c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
+        } else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
+            c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
+        } else {
+            c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
+        }
+    }
+    else {
+        c = crc32_braid_internal(c, buf, len);
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return c ^ 0xffffffff;
+}
+#endif
index 806e99a6f62b02f5d07e3da853a083597662633b..d2d25fb7cda3ffefeaf582c4cd20ac8fa4b5f45c 100644 (file)
@@ -85,6 +85,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
 
     features->has_sse2 = edx & 0x4000000;
     features->has_ssse3 = ecx & 0x200;
+    features->has_sse41 = ecx & 0x80000;
     features->has_sse42 = ecx & 0x100000;
     features->has_pclmulqdq = ecx & 0x2;
 
index 3901ad75becc2b928827010ad9c55e1ccaf3d7ff..2118b8e87ac4bc005a7a56da215ed19131d1b8f4 100644 (file)
@@ -17,6 +17,7 @@ struct x86_cpu_features {
     int has_bmi2;
     int has_sse2;
     int has_ssse3;
+    int has_sse41;
     int has_sse42;
     int has_pclmulqdq;
     int has_vpclmulqdq;
index 8e1943a7d2f2557396aad6a4185d0985d5fd6658..5e13ffe01b35eb25aff1f68c26e936bb8ed2ed94 100644 (file)
@@ -11,7 +11,7 @@
  * Further context:
  * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
 #if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
-#define NO_CHORBA_SSE2
+#define NO_CHORBA_SSE
 #endif
 
 #ifdef X86_SSE2
@@ -36,6 +36,12 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig
 void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
 #endif
 
+#ifdef X86_SSE41
+#   if !defined(WITHOUT_CHORBA)
+    uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
+#   endif
+#endif
+
 #ifdef X86_SSE42
 uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
@@ -104,7 +110,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #      define native_longest_match longest_match_sse2
 #      undef native_longest_match_slow
 #      define native_longest_match_slow longest_match_slow_sse2
-#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
 #          undef native_crc32
 #          define native_crc32 crc32_chorba_sse2
 #      endif
@@ -119,6 +125,10 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #    undef native_inflate_fast
 #    define native_inflate_fast inflate_fast_ssse3
 #  endif
+#  if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
+#   undef native_crc32
+#   define native_crc32 crc32_chorba_sse41
+#   endif
 // X86 - SSE4.2
 #  if defined(X86_SSE42) && defined(__SSE4_2__)
 #    undef native_adler32_fold_copy
index cf0ee5268d9cb39224cf96a5eb525d7061064ab3..66872766d92a6289a95648e0d225ef566ea949ae 100644 (file)
@@ -565,6 +565,29 @@ macro(check_ssse3_intrinsics)
     )
 endmacro()
 
+macro(check_sse41_intrinsics)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(SSE41FLAG "-msse4.1")
+            else()
+                set(SSE41FLAG "/arch:SSE4.1")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(SSE41FLAG "-msse4.1")
+        endif()
+    endif()
+    # Check whether compiler supports SSE4.1 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <smmintrin.h>
+        __m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
+        int main(void) { return 0; }"
+        HAVE_SSE41_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
 macro(check_sse42_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
index 05b7185811b89b4076b905fc16c056f02501172f..f138118c106fc2876d59aeafdab3cc07fdd0ca12 100755 (executable)
--- a/configure
+++ b/configure
@@ -111,6 +111,7 @@ avx512vnniflag="${avx512flag} -mavx512vnni"
 avx2flag="-mavx2 -mbmi2"
 sse2flag="-msse2"
 ssse3flag="-mssse3"
+sse41flag="-msse4.1"
 sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
 vpclmulflag="-mvpclmulqdq -mavx512f"
@@ -1590,6 +1591,22 @@ EOF
     fi
 }
 
+check_sse41_intrinsics() {
+    # Check whether compiler supports SSE4.1 intrinsics
+    cat > $test.c << EOF
+#include <smmintrin.h>
+__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
+int main(void) { return 0; }
+EOF
+    if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
+        echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
+        HAVE_SSE41_INTRIN=1
+    else
+        echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
+        HAVE_SSE41_INTRIN=0
+    fi
+}
+
 check_sse42_intrinsics() {
     # Check whether compiler supports SSE4.2 intrinsics
     cat > $test.c << EOF
@@ -1717,6 +1734,15 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
             fi
 
+            check_sse41_intrinsics
+
+            if test ${HAVE_SSE41_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE41"
+                SFLAGS="${SFLAGS} -DX86_SSE41"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chorba_sse41.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chorba_sse41.lo"
+            fi
+
             check_sse42_intrinsics
 
             if test ${HAVE_SSE42_INTRIN} -eq 1; then
@@ -2263,6 +2289,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
 /^SSE2FLAG *=/s#=.*#=$sse2flag#
 /^SSSE3FLAG *=/s#=.*#=$ssse3flag#
+/^SSE41FLAG *=/s#=.*#=$sse41flag#
 /^SSE42FLAG *=/s#=.*#=$sse42flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
 /^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
index a7c1bd23ccd7f7c83dacdfa441cc47c9d1fed506..1d38637fc90a0f2d57ca33b460eadf6ee1fa37bb 100644 (file)
@@ -75,7 +75,7 @@ static void init_functable(void) {
     {
         ft.chunkmemset_safe = &chunkmemset_safe_sse2;
         ft.chunksize = &chunksize_sse2;
-#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
+#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
         ft.crc32 = &crc32_chorba_sse2;
 #endif
         ft.inflate_fast = &inflate_fast_sse2;
@@ -95,6 +95,16 @@ static void init_functable(void) {
         ft.inflate_fast = &inflate_fast_ssse3;
     }
 #endif
+
+    // X86 - SSE4.1
+#ifdef X86_SSE41
+    if (cf.x86.has_sse41) {
+#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
+        ft.crc32 = &crc32_chorba_sse41;
+#endif
+    }
+#endif
+
     // X86 - SSE4.2
 #ifdef X86_SSE42
     if (cf.x86.has_sse42) {
index e51cff7bba8488ed8a8bccca637a7ab0187d6afd..e6947715ff6aa7b0c5d1284c32ed571776487ac4 100644 (file)
@@ -69,8 +69,11 @@ BENCHMARK_CRC32(native, native_crc32, 1);
 #else
 
 #ifndef WITHOUT_CHORBA
-#   if defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+#   if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
     BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+#       if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
+        BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
+#       endif
 #   endif
 #endif
 
index f6aac12a9742fcb11ce7555fad0d715f5234d433..56667f0283298869a9f2a418497e0a351ddd5017 100644 (file)
@@ -193,7 +193,8 @@ static const crc32_test tests[] = {
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
     "h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&", 600, 0x888AFA5B},
-  {0x0, buf32k, 32768, 0x217726B2}
+  {0x0, buf32k, 32768, 0x217726B2},
+  {0x0, buf32k, 16384, 0xE81722F0}
 };
 
 class crc32_variant : public ::testing::TestWithParam<crc32_test> {
@@ -281,8 +282,11 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 #ifdef X86_VPCLMULQDQ_CRC
 TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 #endif
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
 TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
 #endif
+#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
+TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
+#endif
 
 #endif