From: Cameron Cawley Date: Tue, 28 Mar 2023 18:01:44 +0000 (+0100) Subject: Enable use of _mm_shuffle_epi8 on machines without SSE4.1 X-Git-Tag: 2.1.0-beta1~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b09215f75a0b69e19095133c3556e3e9c0686cc0;p=thirdparty%2Fzlib-ng.git Enable use of _mm_shuffle_epi8 on machines without SSE4.1 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ed3b2417..e5184cca3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,7 +117,6 @@ elseif(BASEARCH_X86_FOUND) option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON) option(WITH_SSE2 "Build with SSE2" ON) option(WITH_SSSE3 "Build with SSSE3" ON) - option(WITH_SSE41 "Build with SSE41" ON) option(WITH_SSE42 "Build with SSE42" ON) option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON) option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON) @@ -133,8 +132,7 @@ mark_as_advanced(FORCE WITH_DFLTCC_INFLATE WITH_CRC32_VX WITH_AVX2 WITH_SSE2 - WITH_SSSE3 WITH_SSE41 - WITH_SSE42 + WITH_SSSE3 WITH_SSE42 WITH_PCLMULQDQ WITH_ALTIVEC WITH_POWER8 @@ -787,17 +785,6 @@ if(WITH_OPTIM) set(WITH_AVX512VNNI OFF) endif() endif() - if(WITH_SSE41) - check_sse41_intrinsics() - if(HAVE_SSE41_INTRIN) - add_definitions(-DX86_SSE41) - list(APPEND SSE41_SRCS ${ARCHDIR}/chunkset_sse41.c) - list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS}) - set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}") - else() - set(WITH_SSE41 OFF) - endif() - endif() if(WITH_SSE42) check_sse42_intrinsics() if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) @@ -835,7 +822,7 @@ if(WITH_OPTIM) check_ssse3_intrinsics() if(HAVE_SSSE3_INTRIN) add_definitions(-DX86_SSSE3) - set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c) + set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") @@ -1226,7 +1213,6 @@ elseif(BASEARCH_X86_FOUND) add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI") add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3") - add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41") add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42") add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ") add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ") diff --git a/README.md b/README.md index 5b8e8ccdc..aa72365c9 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Features * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX * Compare256 implementations using SSE2, AVX2, Neon, & POWER9 - * Inflate chunk copying using SSE2, AVX, Neon & VSX + * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX * Support for hardware-accelerated deflate using IBM Z DFLTCC * Unaligned memory read/writes and large bit buffer improvements * Includes improvements from Cloudflare and Intel forks @@ -213,7 +213,7 @@ Advanced Build Options | WITH_AVX512 | | Build with AVX512 intrinsics | ON | | WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | | WITH_SSE2 | | Build with SSE2 intrinsics | ON | -| WITH_SSE41 | | Build with SSE41 intrinsics | ON | +| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON | | WITH_SSE42 | | Build with SSE42 intrinsics | ON | | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON | | WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON | diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 668c0019e..1890c9135 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -69,7 +69,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t *chunk_rem = lut_rem.remval; #ifdef Z_MEMORY_SANITIZER - /* See note in chunkset_sse41.c for why this is ok */ + /* See note in chunkset_ssse3.c for why this is ok */ __msan_unpoison(buf + dist, 16 - dist); #endif diff --git a/arch/generic/chunk_permute_table.h b/arch/generic/chunk_permute_table.h index c7b2d2de7..bad66ccc7 100644 --- a/arch/generic/chunk_permute_table.h +++ b/arch/generic/chunk_permute_table.h @@ -1,4 +1,4 @@ -/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions. +/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions. * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 4cebe5553..5fd51929c 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -13,7 +13,6 @@ AVX512VNNIFLAG=-mavx512vnni AVX2FLAG=-mavx2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 -SSE41FLAG=-msse4.1 SSE42FLAG=-msse4.2 PCLMULFLAG=-mpclmul VPCLMULFLAG=-mvpclmulqdq @@ -33,7 +32,7 @@ all: \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ chunkset_sse2.o chunkset_sse2.lo \ - chunkset_sse41.o chunkset_sse41.lo \ + chunkset_ssse3.o chunkset_ssse3.lo \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ insert_string_sse42.o insert_string_sse42.lo \ @@ -60,11 +59,11 @@ chunkset_sse2.o: chunkset_sse2.lo: $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c -chunkset_sse41.o: - $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c +chunkset_ssse3.o: + $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c -chunkset_sse41.lo: - $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c +chunkset_ssse3.lo: + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c compare256_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index c2df2322f..abcbb474f 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -85,7 +85,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t *chunk_rem = lut_rem.remval; #ifdef Z_MEMORY_SANITIZER - /* See note in chunkset_sse4.c for why this is ok */ + /* See note in chunkset_ssse3.c for why this is ok */ __msan_unpoison(buf + dist, 32 - dist); #endif diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_ssse3.c similarity index 87% rename from arch/x86/chunkset_sse41.c rename to arch/x86/chunkset_ssse3.c index 4b7396bca..0bd626385 100644 --- a/arch/x86/chunkset_sse41.c +++ b/arch/x86/chunkset_ssse3.c @@ -1,13 +1,13 @@ -/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks. +/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks. * For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" -/* This requires SSE2 support. While it's implicit with SSE4, we can minimize +/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize * code size by sharing the chunkcopy functions, which will certainly compile * to identical machine code */ -#if defined(X86_SSE41) && defined(X86_SSE2) +#if defined(X86_SSSE3) && defined(X86_SSE2) #include #include "../generic/chunk_permute_table.h" @@ -88,15 +88,15 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); -#define CHUNKSIZE chunksize_sse41 -#define CHUNKMEMSET chunkmemset_sse41 -#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41 +#define CHUNKSIZE chunksize_ssse3 +#define CHUNKMEMSET chunkmemset_ssse3 +#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3 #define CHUNKCOPY chunkcopy_sse2 #define CHUNKUNROLL chunkunroll_sse2 #include "chunkset_tpl.h" -#define INFLATE_FAST inflate_fast_sse41 +#define INFLATE_FAST inflate_fast_ssse3 #include "inffast_tpl.h" diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c index f60ddbcf9..3272e3fdd 100644 --- a/arch/x86/x86_features.c +++ b/arch/x86/x86_features.c @@ -66,7 +66,6 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) { features->has_sse2 = edx & 0x4000000; features->has_ssse3 = ecx & 0x200; - features->has_sse41 = ecx & 0x80000; features->has_sse42 = ecx & 0x100000; features->has_pclmulqdq = ecx & 0x2; diff --git a/arch/x86/x86_features.h b/arch/x86/x86_features.h index 00b510ffc..4a36bde83 100644 --- a/arch/x86/x86_features.h +++ b/arch/x86/x86_features.h @@ -12,7 +12,6 @@ struct x86_cpu_features { int has_avx512vnni; int has_sse2; int has_ssse3; - int has_sse41; int has_sse42; int has_pclmulqdq; int has_vpclmulqdq; diff --git a/chunkset_tpl.h b/chunkset_tpl.h index f70ef42cd..f909a1255 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -5,8 +5,8 @@ #include "zbuild.h" #include -#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) -extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len); +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) +extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len); #endif /* Returns the chunk size */ @@ -98,9 +98,9 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); /* Only AVX2 */ -#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) if (len <= 16) { - return chunkmemset_sse41(out, dist, len); + return chunkmemset_ssse3(out, dist, len); } #endif diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index 7b59cec53..186d87d81 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -435,34 +435,6 @@ macro(check_ssse3_intrinsics) ) endmacro() -macro(check_sse41_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSE41FLAG "-msse4.1") - else() - set(SSE41FLAG "/arch:SSE4.1") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) - set(SSE41FLAG "-msse4.1") - endif() - endif() - # Check whether compiler supports SSE4.1 intrinsics - set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG}") - check_c_source_compile_or_run( - "#include - int main(void) { - __m128i u, v, w; - u = _mm_set1_epi8(1); - v = _mm_set1_epi8(2); - w = _mm_sad_epu8(u, v); - (void)w; - return 0; - }" - HAVE_SSE41_INTRIN - ) -endmacro() - macro(check_sse42_intrinsics) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR APPLE) diff --git a/configure b/configure index eb9e57e90..2c320227f 100755 --- a/configure +++ b/configure @@ -110,7 +110,6 @@ avx512vnniflag="-mavx512vnni ${avx512flag}" avx2flag="-mavx2" sse2flag="-msse2" ssse3flag="-mssse3" -sse41flag="-msse4.1" sse42flag="-msse4.2" pclmulflag="-mpclmul" vpclmulflag="-mvpclmulqdq -mavx512f" @@ -1399,29 +1398,6 @@ EOF fi } -check_sse41_intrinsics() { - # Check whether compiler supports SSE4.1 intrinsics - cat > $test.c << EOF -#include -int main(void) -{ - __m128i u, v, w; - u = _mm_set1_epi8(1); - v = _mm_set1_epi8(2); - w = _mm_sad_epu8(u, v); - (void)w; - return 0; -} -EOF - if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then - echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log - HAVE_SSE41_INTRIN=1 - else - echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log - HAVE_SSE41_INTRIN=0 - fi -} - check_sse42_intrinsics() { # Check whether compiler supports SSE4 CRC inline asm cat > $test.c << EOF @@ -1615,16 +1591,6 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo" fi - check_sse41_intrinsics - - if test ${HAVE_SSE41_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_SSE41" - SFLAGS="${SFLAGS} -DX86_SSE41" - - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse41.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse41.lo" - fi - check_sse42_intrinsics if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then @@ -1659,8 +1625,8 @@ case "${ARCH}" in if test ${HAVE_SSSE3_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSSE3" SFLAGS="${SFLAGS} -DX86_SSSE3" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o chunkset_ssse3.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo" fi check_pclmulqdq_intrinsics @@ -2111,7 +2077,6 @@ echo sharedlibdir = $sharedlibdir >> configure.log echo uname = $uname >> configure.log echo sse2flag = $sse2flag >> configure.log echo ssse3flag = $ssse3flag >> configure.log -echo sse41flag = $sse41flag >> configure.log echo sse42flag = $sse42flag >> configure.log echo pclmulflag = $pclmulflag >> configure.log echo vpclmulflag = $vpclmulflag >> configure.log @@ -2250,7 +2215,6 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag# /^SSE2FLAG *=/s#=.*#=$sse2flag# /^SSSE3FLAG *=/s#=.*#=$ssse3flag# -/^SSE41FLAG *=/s#=.*#=$sse41flag# /^SSE42FLAG *=/s#=.*#=$sse42flag# /^PCLMULFLAG *=/s#=.*#=$pclmulflag# /^VPCLMULFLAG *=/s#=.*#=$vpclmulflag# diff --git a/cpu_features.h b/cpu_features.h index 462671a18..e47f94782 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -99,8 +99,8 @@ extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, un extern uint32_t chunksize_sse2(void); extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif -#ifdef X86_SSE41 -extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#ifdef X86_SSSE3 +extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif #ifdef X86_AVX2 extern uint32_t chunksize_avx(void); @@ -126,8 +126,8 @@ extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); #ifdef X86_SSE2 extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start); #endif -#ifdef X86_SSE41 -extern void inflate_fast_sse41(PREFIX3(stream) *strm, uint32_t start); +#ifdef X86_SSSE3 +extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); #endif #ifdef X86_AVX2 extern void inflate_fast_avx(PREFIX3(stream) *strm, uint32_t start); diff --git a/functable.c b/functable.c index c7d477c7f..4212da090 100644 --- a/functable.c +++ b/functable.c @@ -75,16 +75,15 @@ static void init_functable(void) { #endif // X86 - SSSE3 #ifdef X86_SSSE3 - if (cf.x86.has_ssse3) + if (cf.x86.has_ssse3) { ft.adler32 = &adler32_ssse3; -#endif - // X86 - SSE4 -#if defined(X86_SSE41) && defined(X86_SSE2) - if (cf.x86.has_sse41) { - ft.chunkmemset_safe = &chunkmemset_safe_sse41; - ft.inflate_fast = &inflate_fast_sse41; +# ifdef X86_SSE2 + ft.chunkmemset_safe = &chunkmemset_safe_ssse3; + ft.inflate_fast = &inflate_fast_ssse3; +# endif } #endif + // X86 - SSE4.2 #ifdef X86_SSE42 if (cf.x86.has_sse42) { ft.adler32_fold_copy = &adler32_fold_copy_sse42; diff --git a/win32/Makefile.msc b/win32/Makefile.msc index d2a98d6f0..8a01e3171 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -56,6 +56,7 @@ OBJS = \ chunkset.obj \ chunkset_avx.obj \ chunkset_sse2.obj \ + chunkset_ssse3.obj \ compare256.obj \ compare256_avx2.obj \ compare256_sse2.obj \ @@ -202,6 +203,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h +chunkset_ssse3.obj: $(SRCDIR)/arch/x86/chunkset_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h