Enable use of _mm_shuffle_epi8 on machines without SSE4.1

author Cameron Cawley <ccawley2011@gmail.com>
Tue, 28 Mar 2023 18:01:44 +0000 (19:01 +0100)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sat, 1 Apr 2023 15:27:49 +0000 (17:27 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 9ed3b24175b855444533042e6e7b9253ef62f360..e5184cca30f033bdd1b957014c5d14de0a6be898 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,7 +117,6 @@ elseif(BASEARCH_X86_FOUND)
      option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
      option(WITH_SSE2 "Build with SSE2" ON)
      option(WITH_SSSE3 "Build with SSSE3" ON)
-    option(WITH_SSE41 "Build with SSE41" ON)
      option(WITH_SSE42 "Build with SSE42" ON)
      option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
      option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON)
@@ -133,8 +132,7 @@ mark_as_advanced(FORCE
      WITH_DFLTCC_INFLATE
      WITH_CRC32_VX
      WITH_AVX2 WITH_SSE2
-    WITH_SSSE3 WITH_SSE41
-    WITH_SSE42
+    WITH_SSSE3 WITH_SSE42
      WITH_PCLMULQDQ
      WITH_ALTIVEC
      WITH_POWER8
@@ -787,17 +785,6 @@ if(WITH_OPTIM)
                  set(WITH_AVX512VNNI OFF)
              endif()
          endif()
-        if(WITH_SSE41)
-            check_sse41_intrinsics()
-            if(HAVE_SSE41_INTRIN)
-                add_definitions(-DX86_SSE41)
-                list(APPEND SSE41_SRCS ${ARCHDIR}/chunkset_sse41.c)
-                list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
-                set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
-            else()
-                set(WITH_SSE41 OFF)
-            endif()
-        endif()
          if(WITH_SSE42)
              check_sse42_intrinsics()
              if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
@@ -835,7 +822,7 @@ if(WITH_OPTIM)
              check_ssse3_intrinsics()
              if(HAVE_SSSE3_INTRIN)
                  add_definitions(-DX86_SSSE3)
-                set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c)
+                set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
                  add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"")
                  list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
                  set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
@@ -1226,7 +1213,6 @@ elseif(BASEARCH_X86_FOUND)
      add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
      add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
      add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
-    add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
      add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
      add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
      add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
diff --git a/README.md b/README.md

index 5b8e8ccdc25755294ef01acb43ef41ca224f3413..aa72365c9526b306c0b36b0c672c47d8a38ddbb1 100644 (file)
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Features
    * Hash table implementation using CRC32-C intrinsics on x86 and ARM
    * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
    * Compare256 implementations using SSE2, AVX2, Neon, & POWER9
-  * Inflate chunk copying using SSE2, AVX, Neon & VSX
+  * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
    * Support for hardware-accelerated deflate using IBM Z DFLTCC
  * Unaligned memory read/writes and large bit buffer improvements
  * Includes improvements from Cloudflare and Intel forks
@@ -213,7 +213,7 @@ Advanced Build Options
  | WITH_AVX512                     |                       | Build with AVX512 intrinsics                                        | ON                     |
  | WITH_AVX512VNNI                 |                       | Build with AVX512VNNI intrinsics                                    | ON                     |
  | WITH_SSE2                       |                       | Build with SSE2 intrinsics                                          | ON                     |
-| WITH_SSE41                      |                       | Build with SSE41 intrinsics                                         | ON                     |
+| WITH_SSSE3                      |                       | Build with SSSE3 intrinsics                                         | ON                     |
  | WITH_SSE42                      |                       | Build with SSE42 intrinsics                                         | ON                     |
  | WITH_PCLMULQDQ                  |                       | Build with PCLMULQDQ intrinsics                                     | ON                     |
  | WITH_VPCLMULQDQ                 | --without-vpclmulqdq  | Build with VPCLMULQDQ intrinsics                                    | ON                     |
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c

index 668c0019e96da5d7da072736548591f76e0b8614..1890c91356ee4be85e7650cc5867bab5bba02de1 100644 (file)
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -69,7 +69,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
      *chunk_rem = lut_rem.remval;
  
  #ifdef Z_MEMORY_SANITIZER
-    /* See note in chunkset_sse41.c for why this is ok */
+    /* See note in chunkset_ssse3.c for why this is ok */
      __msan_unpoison(buf + dist, 16 - dist);
  #endif
  
diff --git a/arch/generic/chunk_permute_table.h b/arch/generic/chunk_permute_table.h

index c7b2d2de7f955e8ca88355b7a0f9149d21686867..bad66ccc774b75d67b084d523b6fcfa03d17991c 100644 (file)
--- a/arch/generic/chunk_permute_table.h
+++ b/arch/generic/chunk_permute_table.h
@@ -1,4 +1,4 @@
-/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions.
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in

index 4cebe55531aa0d2d81f4d4807b565b588921d333..5fd51929ce28761d423fbbc39ea6493b3a2beb20 100644 (file)
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -13,7 +13,6 @@ AVX512VNNIFLAG=-mavx512vnni
  AVX2FLAG=-mavx2
  SSE2FLAG=-msse2
  SSSE3FLAG=-mssse3
-SSE41FLAG=-msse4.1
  SSE42FLAG=-msse4.2
  PCLMULFLAG=-mpclmul
  VPCLMULFLAG=-mvpclmulqdq
@@ -33,7 +32,7 @@ all: \
         adler32_ssse3.o adler32_ssse3.lo \
         chunkset_avx.o chunkset_avx.lo \
         chunkset_sse2.o chunkset_sse2.lo \
-       chunkset_sse41.o chunkset_sse41.lo \
+       chunkset_ssse3.o chunkset_ssse3.lo \
         compare256_avx2.o compare256_avx2.lo \
         compare256_sse2.o compare256_sse2.lo \
         insert_string_sse42.o insert_string_sse42.lo \
@@ -60,11 +59,11 @@ chunkset_sse2.o:
  chunkset_sse2.lo:
         $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
  
-chunkset_sse41.o:
-       $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+chunkset_ssse3.o:
+       $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
  
-chunkset_sse41.lo:
-       $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+chunkset_ssse3.lo:
+       $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
  
  compare256_avx2.o:
         $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c

index c2df2322fe248568f8727bcd1032cd03abda10e3..abcbb474f5183ce23a810e2a22f5eef02036cc84 100644 (file)
--- a/arch/x86/chunkset_avx.c
+++ b/arch/x86/chunkset_avx.c
@@ -85,7 +85,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
      *chunk_rem = lut_rem.remval;
  
  #ifdef Z_MEMORY_SANITIZER
-    /* See note in chunkset_sse4.c for why this is ok */
+    /* See note in chunkset_ssse3.c for why this is ok */
      __msan_unpoison(buf + dist, 32 - dist);
  #endif
  
diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_ssse3.c

similarity index 87%

rename from arch/x86/chunkset_sse41.c

rename to arch/x86/chunkset_ssse3.c

index 4b7396bcacec773ab9ca87fb78f200a391c77571..0bd6263859cb42c4aee69ea977b8706f70099df1 100644 (file)
--- a/arch/x86/chunkset_sse41.c
+++ b/arch/x86/chunkset_ssse3.c
@@ -1,13 +1,13 @@
-/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
  #include "zbuild.h"
  
-/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
+/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
   * code size by sharing the chunkcopy functions, which will certainly compile
   * to identical machine code */
-#if defined(X86_SSE41) && defined(X86_SSE2)
+#if defined(X86_SSSE3) && defined(X86_SSE2)
  #include <immintrin.h>
  #include "../generic/chunk_permute_table.h"
  
@@ -88,15 +88,15 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
  extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
  extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
  
-#define CHUNKSIZE        chunksize_sse41
-#define CHUNKMEMSET      chunkmemset_sse41
-#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
+#define CHUNKSIZE        chunksize_ssse3
+#define CHUNKMEMSET      chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
  #define CHUNKCOPY        chunkcopy_sse2
  #define CHUNKUNROLL      chunkunroll_sse2
  
  #include "chunkset_tpl.h"
  
-#define INFLATE_FAST     inflate_fast_sse41
+#define INFLATE_FAST     inflate_fast_ssse3
  
  #include "inffast_tpl.h"
  
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c

index f60ddbcf94ba6d8de102f4546c3115bf40475f73..3272e3fdd9721cb84442645030ef29bac1b8ac6f 100644 (file)
--- a/arch/x86/x86_features.c
+++ b/arch/x86/x86_features.c
@@ -66,7 +66,6 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
  
      features->has_sse2 = edx & 0x4000000;
      features->has_ssse3 = ecx & 0x200;
-    features->has_sse41 = ecx & 0x80000;
      features->has_sse42 = ecx & 0x100000;
      features->has_pclmulqdq = ecx & 0x2;
  
diff --git a/arch/x86/x86_features.h b/arch/x86/x86_features.h

index 00b510ffc1171547687b23ec001b00be763624af..4a36bde835d32efc64f19a7968456a03f698ee1b 100644 (file)
--- a/arch/x86/x86_features.h
+++ b/arch/x86/x86_features.h
@@ -12,7 +12,6 @@ struct x86_cpu_features {
      int has_avx512vnni;
      int has_sse2;
      int has_ssse3;
-    int has_sse41;
      int has_sse42;
      int has_pclmulqdq;
      int has_vpclmulqdq;
diff --git a/chunkset_tpl.h b/chunkset_tpl.h

index f70ef42cdb836f93d15241ce8d4a961b0cfbb775..f909a12557f0c8cd2765db2cf31ca8425249397e 100644 (file)
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -5,8 +5,8 @@
  #include "zbuild.h"
  #include <stdlib.h>
  
-#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
-extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len);
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
  #endif
  
  /* Returns the chunk size */
@@ -98,9 +98,9 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
         Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
      Assert(dist > 0, "chunkmemset cannot have a distance 0");
      /* Only AVX2 */
-#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
      if (len <= 16) {
-        return chunkmemset_sse41(out, dist, len);
+        return chunkmemset_ssse3(out, dist, len);
      }
  #endif
  
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake

index 7b59cec53c42484c28555dab9a60cdccc0bbfebb..186d87d814f954c7663aa988b166eea89a41a594 100644 (file)
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -435,34 +435,6 @@ macro(check_ssse3_intrinsics)
      )
  endmacro()
  
-macro(check_sse41_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(SSE41FLAG "-msse4.1")
-        else()
-            set(SSE41FLAG "/arch:SSE4.1")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
-            set(SSE41FLAG "-msse4.1")
-        endif()
-    endif()
-    # Check whether compiler supports SSE4.1 intrinsics
-    set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG}")
-    check_c_source_compile_or_run(
-        "#include <immintrin.h>
-        int main(void) {
-            __m128i u, v, w;
-            u = _mm_set1_epi8(1);
-            v = _mm_set1_epi8(2);
-            w = _mm_sad_epu8(u, v);
-            (void)w;
-            return 0;
-        }"
-        HAVE_SSE41_INTRIN
-    )
-endmacro()
-
  macro(check_sse42_intrinsics)
      if(CMAKE_C_COMPILER_ID MATCHES "Intel")
          if(CMAKE_HOST_UNIX OR APPLE)
diff --git a/configure b/configure

index eb9e57e90d4c304f00f234cc1206c78d7a13a216..2c320227fe2ba75bca997f18c3ae13719a735334 100755 (executable)
--- a/configure
+++ b/configure
@@ -110,7 +110,6 @@ avx512vnniflag="-mavx512vnni ${avx512flag}"
  avx2flag="-mavx2"
  sse2flag="-msse2"
  ssse3flag="-mssse3"
-sse41flag="-msse4.1"
  sse42flag="-msse4.2"
  pclmulflag="-mpclmul"
  vpclmulflag="-mvpclmulqdq -mavx512f"
@@ -1399,29 +1398,6 @@ EOF
      fi
  }
  
-check_sse41_intrinsics() {
-    # Check whether compiler supports SSE4.1 intrinsics
-    cat > $test.c << EOF
-#include <smmintrin.h>
-int main(void)
-{
-    __m128i u, v, w;
-    u = _mm_set1_epi8(1);
-    v = _mm_set1_epi8(2);
-    w = _mm_sad_epu8(u, v);
-    (void)w;
-    return 0;
-}
-EOF
-    if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
-        echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
-        HAVE_SSE41_INTRIN=1
-    else
-        echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
-        HAVE_SSE41_INTRIN=0
-    fi
-}
-
  check_sse42_intrinsics() {
      # Check whether compiler supports SSE4 CRC inline asm
      cat > $test.c << EOF
@@ -1615,16 +1591,6 @@ case "${ARCH}" in
                  ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo"
              fi
  
-            check_sse41_intrinsics
-
-            if test ${HAVE_SSE41_INTRIN} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_SSE41"
-                SFLAGS="${SFLAGS} -DX86_SSE41"
-
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse41.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse41.lo"
-            fi
-
              check_sse42_intrinsics
  
              if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
@@ -1659,8 +1625,8 @@ case "${ARCH}" in
              if test ${HAVE_SSSE3_INTRIN} -eq 1; then
                  CFLAGS="${CFLAGS} -DX86_SSSE3"
                  SFLAGS="${SFLAGS} -DX86_SSSE3"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o chunkset_ssse3.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
              fi
  
              check_pclmulqdq_intrinsics
@@ -2111,7 +2077,6 @@ echo sharedlibdir = $sharedlibdir >> configure.log
  echo uname = $uname >> configure.log
  echo sse2flag = $sse2flag >> configure.log
  echo ssse3flag = $ssse3flag >> configure.log
-echo sse41flag = $sse41flag >> configure.log
  echo sse42flag = $sse42flag >> configure.log
  echo pclmulflag = $pclmulflag >> configure.log
  echo vpclmulflag = $vpclmulflag >> configure.log
@@ -2250,7 +2215,6 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
  /^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
  /^SSE2FLAG *=/s#=.*#=$sse2flag#
  /^SSSE3FLAG *=/s#=.*#=$ssse3flag#
-/^SSE41FLAG *=/s#=.*#=$sse41flag#
  /^SSE42FLAG *=/s#=.*#=$sse42flag#
  /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
  /^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
diff --git a/cpu_features.h b/cpu_features.h

index 462671a187ee264a01bcb8c86f1f5d1d7457f6c5..e47f947826c9a69fa11fd844230de5741876f4a1 100644 (file)
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -99,8 +99,8 @@ extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, un
  extern uint32_t chunksize_sse2(void);
  extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
  #endif
-#ifdef X86_SSE41
-extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSSE3
+extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
  #endif
  #ifdef X86_AVX2
  extern uint32_t chunksize_avx(void);
@@ -126,8 +126,8 @@ extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
  #ifdef X86_SSE2
  extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
  #endif
-#ifdef X86_SSE41
-extern void inflate_fast_sse41(PREFIX3(stream) *strm, uint32_t start);
+#ifdef X86_SSSE3
+extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
  #endif
  #ifdef X86_AVX2
  extern void inflate_fast_avx(PREFIX3(stream) *strm, uint32_t start);
diff --git a/functable.c b/functable.c

index c7d477c7f066a2934a05e91b019f04651d450731..4212da09075a2f7437d597dd105ba64cc8fda927 100644 (file)
--- a/functable.c
+++ b/functable.c
@@ -75,16 +75,15 @@ static void init_functable(void) {
  #endif
      // X86 - SSSE3
  #ifdef X86_SSSE3
-    if (cf.x86.has_ssse3)
+    if (cf.x86.has_ssse3) {
          ft.adler32 = &adler32_ssse3;
-#endif
-    // X86 - SSE4
-#if defined(X86_SSE41) && defined(X86_SSE2)
-    if (cf.x86.has_sse41) {
-        ft.chunkmemset_safe = &chunkmemset_safe_sse41;
-        ft.inflate_fast = &inflate_fast_sse41;
+#  ifdef X86_SSE2
+        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
+        ft.inflate_fast = &inflate_fast_ssse3;
+#  endif
      }
  #endif
+    // X86 - SSE4.2
  #ifdef X86_SSE42
      if (cf.x86.has_sse42) {
          ft.adler32_fold_copy = &adler32_fold_copy_sse42;
diff --git a/win32/Makefile.msc b/win32/Makefile.msc

index d2a98d6f0a1b94386cd30eb54d4d1d32c4d494c3..8a01e31710c07a00e4da53118098c965db14be53 100644 (file)
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -56,6 +56,7 @@ OBJS = \
         chunkset.obj \
         chunkset_avx.obj \
         chunkset_sse2.obj \
+       chunkset_ssse3.obj \
         compare256.obj \
         compare256_avx2.obj \
         compare256_sse2.obj \
@@ -202,6 +203,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
  chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
  chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
  chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+chunkset_ssse3.obj: $(SRCDIR)/arch/x86/chunkset_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
  cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
  crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
  crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
author	Cameron Cawley <ccawley2011@gmail.com>
	Tue, 28 Mar 2023 18:01:44 +0000 (19:01 +0100)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Sat, 1 Apr 2023 15:27:49 +0000 (17:27 +0200)
CMakeLists.txt		patch \| blob \| blame \| history
README.md		patch \| blob \| blame \| history
arch/arm/chunkset_neon.c		patch \| blob \| blame \| history
arch/generic/chunk_permute_table.h		patch \| blob \| blame \| history
arch/x86/Makefile.in		patch \| blob \| blame \| history
arch/x86/chunkset_avx.c		patch \| blob \| blame \| history
arch/x86/chunkset_ssse3.c	[moved from arch/x86/chunkset_sse41.c with 87% similarity]	patch \| blob \| blame \| history
arch/x86/x86_features.c		patch \| blob \| blame \| history
arch/x86/x86_features.h		patch \| blob \| blame \| history
chunkset_tpl.h		patch \| blob \| blame \| history
cmake/detect-intrinsics.cmake		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
cpu_features.h		patch \| blob \| blame \| history
functable.c		patch \| blob \| blame \| history
win32/Makefile.msc		patch \| blob \| blame \| history