option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
option(WITH_SSE2 "Build with SSE2" ON)
option(WITH_SSSE3 "Build with SSSE3" ON)
- option(WITH_SSE41 "Build with SSE41" ON)
option(WITH_SSE42 "Build with SSE42" ON)
option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON)
WITH_DFLTCC_INFLATE
WITH_CRC32_VX
WITH_AVX2 WITH_SSE2
- WITH_SSSE3 WITH_SSE41
- WITH_SSE42
+ WITH_SSSE3 WITH_SSE42
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
set(WITH_AVX512VNNI OFF)
endif()
endif()
- if(WITH_SSE41)
- check_sse41_intrinsics()
- if(HAVE_SSE41_INTRIN)
- add_definitions(-DX86_SSE41)
- list(APPEND SSE41_SRCS ${ARCHDIR}/chunkset_sse41.c)
- list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
- set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_SSE41 OFF)
- endif()
- endif()
if(WITH_SSE42)
check_sse42_intrinsics()
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
check_ssse3_intrinsics()
if(HAVE_SSSE3_INTRIN)
add_definitions(-DX86_SSSE3)
- set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c)
+ set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
- add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon & POWER9
- * Inflate chunk copying using SSE2, AVX, Neon & VSX
+ * Inflate chunk copying using SSE2, SSSE3, AVX2, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
* Includes improvements from Cloudflare and Intel forks
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
-| WITH_SSE41 | | Build with SSE41 intrinsics | ON |
+| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON |
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
*chunk_rem = lut_rem.remval;
#ifdef Z_MEMORY_SANITIZER
- /* See note in chunkset_sse41.c for why this is ok */
+ /* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 16 - dist);
#endif
-/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions.
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
-SSE41FLAG=-msse4.1
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
chunkset_sse2.o chunkset_sse2.lo \
- chunkset_sse41.o chunkset_sse41.lo \
+ chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
chunkset_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
-chunkset_sse41.o:
- $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+chunkset_ssse3.o:
+ $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
-chunkset_sse41.lo:
- $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+chunkset_ssse3.lo:
+ $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
compare256_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
*chunk_rem = lut_rem.remval;
#ifdef Z_MEMORY_SANITIZER
- /* See note in chunkset_sse4.c for why this is ok */
+ /* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 32 - dist);
#endif
-/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
-/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
+/* This requires SSE2 support. While SSE2 is implied by SSSE3, checking for it
* explicitly lets us minimize code size by sharing the SSE2 chunkcopy functions,
* which would certainly compile to identical machine code */
-#if defined(X86_SSE41) && defined(X86_SSE2)
+#if defined(X86_SSSE3) && defined(X86_SSE2)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
-#define CHUNKSIZE chunksize_sse41
-#define CHUNKMEMSET chunkmemset_sse41
-#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
+#define CHUNKSIZE chunksize_ssse3
+#define CHUNKMEMSET chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#include "chunkset_tpl.h"
-#define INFLATE_FAST inflate_fast_sse41
+#define INFLATE_FAST inflate_fast_ssse3
#include "inffast_tpl.h"
features->has_sse2 = edx & 0x4000000;
features->has_ssse3 = ecx & 0x200;
- features->has_sse41 = ecx & 0x80000;
features->has_sse42 = ecx & 0x100000;
features->has_pclmulqdq = ecx & 0x2;
int has_avx512vnni;
int has_sse2;
int has_ssse3;
- int has_sse41;
int has_sse42;
int has_pclmulqdq;
int has_vpclmulqdq;
#include "zbuild.h"
#include <stdlib.h>
-#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
-extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len);
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
/* Returns the chunk size */
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
-#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
if (len <= 16) {
- return chunkmemset_sse41(out, dist, len);
+ return chunkmemset_ssse3(out, dist, len);
}
#endif
)
endmacro()
-macro(check_sse41_intrinsics)
- if(CMAKE_C_COMPILER_ID MATCHES "Intel")
- if(CMAKE_HOST_UNIX OR APPLE)
- set(SSE41FLAG "-msse4.1")
- else()
- set(SSE41FLAG "/arch:SSE4.1")
- endif()
- elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
- if(NOT NATIVEFLAG)
- set(SSE41FLAG "-msse4.1")
- endif()
- endif()
- # Check whether compiler supports SSE4.1 intrinsics
- set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG}")
- check_c_source_compile_or_run(
- "#include <immintrin.h>
- int main(void) {
- __m128i u, v, w;
- u = _mm_set1_epi8(1);
- v = _mm_set1_epi8(2);
- w = _mm_sad_epu8(u, v);
- (void)w;
- return 0;
- }"
- HAVE_SSE41_INTRIN
- )
-endmacro()
-
macro(check_sse42_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
avx2flag="-mavx2"
sse2flag="-msse2"
ssse3flag="-mssse3"
-sse41flag="-msse4.1"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
vpclmulflag="-mvpclmulqdq -mavx512f"
fi
}
-check_sse41_intrinsics() {
- # Check whether compiler supports SSE4.1 intrinsics
- cat > $test.c << EOF
-#include <smmintrin.h>
-int main(void)
-{
- __m128i u, v, w;
- u = _mm_set1_epi8(1);
- v = _mm_set1_epi8(2);
- w = _mm_sad_epu8(u, v);
- (void)w;
- return 0;
-}
-EOF
- if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
- echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
- HAVE_SSE41_INTRIN=1
- else
- echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
- HAVE_SSE41_INTRIN=0
- fi
-}
-
check_sse42_intrinsics() {
# Check whether compiler supports SSE4.2 CRC inline asm
cat > $test.c << EOF
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo"
fi
- check_sse41_intrinsics
-
- if test ${HAVE_SSE41_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_SSE41"
- SFLAGS="${SFLAGS} -DX86_SSE41"
-
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse41.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse41.lo"
- fi
-
check_sse42_intrinsics
if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
if test ${HAVE_SSSE3_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSSE3"
SFLAGS="${SFLAGS} -DX86_SSSE3"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o chunkset_ssse3.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
fi
check_pclmulqdq_intrinsics
echo uname = $uname >> configure.log
echo sse2flag = $sse2flag >> configure.log
echo ssse3flag = $ssse3flag >> configure.log
-echo sse41flag = $sse41flag >> configure.log
echo sse42flag = $sse42flag >> configure.log
echo pclmulflag = $pclmulflag >> configure.log
echo vpclmulflag = $vpclmulflag >> configure.log
/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
/^SSSE3FLAG *=/s#=.*#=$ssse3flag#
-/^SSE41FLAG *=/s#=.*#=$sse41flag#
/^SSE42FLAG *=/s#=.*#=$sse42flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
-#ifdef X86_SSE41
-extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSSE3
+extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX2
extern uint32_t chunksize_avx(void);
#ifdef X86_SSE2
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
#endif
-#ifdef X86_SSE41
-extern void inflate_fast_sse41(PREFIX3(stream) *strm, uint32_t start);
+#ifdef X86_SSSE3
+extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
extern void inflate_fast_avx(PREFIX3(stream) *strm, uint32_t start);
#endif
// X86 - SSSE3
#ifdef X86_SSSE3
- if (cf.x86.has_ssse3)
+ if (cf.x86.has_ssse3) {
ft.adler32 = &adler32_ssse3;
-#endif
- // X86 - SSE4
-#if defined(X86_SSE41) && defined(X86_SSE2)
- if (cf.x86.has_sse41) {
- ft.chunkmemset_safe = &chunkmemset_safe_sse41;
- ft.inflate_fast = &inflate_fast_sse41;
+# ifdef X86_SSE2
+ ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
+ ft.inflate_fast = &inflate_fast_ssse3;
+# endif
}
#endif
+ // X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {
ft.adler32_fold_copy = &adler32_fold_copy_sse42;
chunkset.obj \
chunkset_avx.obj \
chunkset_sse2.obj \
+ chunkset_ssse3.obj \
compare256.obj \
compare256_avx2.obj \
compare256_sse2.obj \
chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+chunkset_ssse3.obj: $(SRCDIR)/arch/x86/chunkset_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h