this drops compilation of optimized functions below that minimum.
It also compiles the optimized functions at or below that minimum
with that version's instruction set enabled, potentially allowing the
compiler to make minor further optimizations.
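
As a rough usage sketch (not part of this change; the checkout path and target name below are assumptions), a project embedding zlib-ng could pin the baseline before adding the subdirectory:

```cmake
# Hypothetical superproject snippet: pin the minimum x86-64 arch version to v3 so
# the SSE2..AVX2 kernels are compiled unconditionally (with AVX2 enabled) and only
# AVX-512-level kernels remain behind runtime CPU detection.
cmake_minimum_required(VERSION 3.12)
project(consumer C)

# Set the cache variable before the subdirectory declares its own options.
set(WITH_X86_64_ARCHVER "3" CACHE STRING "Minimum arch version required" FORCE)

add_subdirectory(zlib-ng)                 # assumed vendored checkout path
add_executable(app main.c)
target_link_libraries(app PRIVATE zlib)   # assumed library target name
```

Configuring the library standalone with `-DWITH_X86_64_ARCHVER=3` should behave the same way.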
-cmake_minimum_required(VERSION 3.5.1...3.29.0)
-if(CMAKE_VERSION VERSION_LESS 3.12)
- cmake_policy(VERSION ${CMAKE_VERSION})
-endif()
+cmake_minimum_required(VERSION 3.12...3.29.0)
message(STATUS "Using CMake version ${CMAKE_VERSION}")
if(POLICY CMP0169)
Useful when embedding into a larger library.
Default is no prefix (empty prefix).")
-# Add multi-choice option
+# Add multi-choice options
set(WITH_SANITIZER AUTO CACHE STRING "Enable sanitizer support")
set_property(CACHE WITH_SANITIZER PROPERTY STRINGS "Memory" "Address" "Undefined" "Thread")
+if(${ARCH} MATCHES "x86_64")
+ set(WITH_X86_64_ARCHVER "1" CACHE STRING "Minimum arch version required, default x86-64-v1")
+ set_property(CACHE WITH_X86_64_ARCHVER PROPERTY STRINGS "1" "2" "3" "4")
+else()
+ set(WITH_X86_64_ARCHVER "0" CACHE STRING "Disabled on non-x86-64 arch")
+endif()
+
if(BASEARCH_ARM_FOUND)
option(WITH_ARMV8 "Build with ARMv8 CRC32 intrinsics" ON)
option(WITH_NEON "Build with NEON intrinsics" ON)
add_definitions(-DWITHOUT_CHORBA)
endif()
-if(${ARCH} MATCHES "x86_64")
-
-endif()
-
if(NOT WITH_C_FALLBACK)
add_definitions(-DNO_C_FALLBACK=1)
endif()
endif()
elseif(BASEARCH_X86_FOUND)
- add_definitions(-DX86_FEATURES)
- list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_functions.h)
- if(WITH_RUNTIME_CPU_DETECTION)
- list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c)
- endif()
- if(MSVC)
- list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
- endif()
- check_xsave_intrinsics()
- if(HAVE_XSAVE_INTRIN)
- add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"")
- if(WITH_RUNTIME_CPU_DETECTION)
- set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
- endif()
- if(NOT (CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8.2))
- add_definitions(-DX86_HAVE_XSAVE_INTRIN)
- endif()
- endif()
- if(WITH_SSE2)
- check_sse2_intrinsics()
- # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true
- if("${ARCH}" MATCHES "i[3-6]86")
- cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF)
- endif()
- if(HAVE_SSE2_INTRIN)
- add_definitions(-DX86_SSE2)
- set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
- list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
- if(NOT ${ARCH} MATCHES "x86_64")
- set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
- add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
- if(FORCE_SSE2)
- add_definitions(-DX86_NOCHECK_SSE2)
- endif()
- endif()
- else()
- set(WITH_SSE2 OFF)
- endif()
- endif()
- if(WITH_SSSE3)
- check_ssse3_intrinsics()
- if(HAVE_SSSE3_INTRIN AND WITH_SSE2)
- add_definitions(-DX86_SSSE3)
- set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
- add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
- set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_SSSE3 OFF)
- endif()
- endif()
- if(WITH_SSE41)
- check_sse41_intrinsics()
- if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
- add_definitions(-DX86_SSE41)
- set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
- list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
- set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_SSE41 OFF)
- endif()
- endif()
- if(WITH_SSE42)
- check_sse42_intrinsics()
- if(HAVE_SSE42_INTRIN AND WITH_SSE41)
- add_definitions(-DX86_SSE42)
- set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
- add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
- set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_SSE42 OFF)
- endif()
- endif()
- if(WITH_PCLMULQDQ)
- check_pclmulqdq_intrinsics()
- if(HAVE_PCLMULQDQ_INTRIN AND WITH_SSE42)
- add_definitions(-DX86_PCLMULQDQ_CRC)
- set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
- add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSE42FLAG} ${PCLMULFLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
- set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
- else()
- set(WITH_PCLMULQDQ OFF)
- endif()
- endif()
- if(WITH_AVX2)
- check_avx2_intrinsics()
- if(HAVE_AVX2_INTRIN AND WITH_SSE42)
- add_definitions(-DX86_AVX2)
- set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
- add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
- list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
- add_feature_info(AVX2_CHUNKSET 1 "Support AVX2 optimized chunkset, using \"${AVX2FLAG}\"")
- list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
- add_feature_info(AVX2_COMPARE256 1 "Support AVX2 optimized compare256, using \"${AVX2FLAG}\"")
- list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
- add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
- set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_AVX2 OFF)
- endif()
- endif()
- if(WITH_AVX512)
- check_avx512_intrinsics()
- if(HAVE_AVX512_INTRIN AND WITH_AVX2)
- add_definitions(-DX86_AVX512)
- list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
- add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
- list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
- add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
- list(APPEND AVX512_SRCS ${ARCHDIR}/compare256_avx512.c)
- add_feature_info(AVX512_COMPARE256 1 "Support AVX512 optimized compare256, using \"${AVX512FLAG}\"")
- list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
- list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
- set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_AVX512 OFF)
- endif()
- endif()
- if(WITH_AVX512VNNI)
- check_avx512vnni_intrinsics()
- if(HAVE_AVX512VNNI_INTRIN AND WITH_AVX2)
- add_definitions(-DX86_AVX512VNNI)
- add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"")
- list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
- list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
- set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
- else()
- set(WITH_AVX512VNNI OFF)
- endif()
- endif()
- if(WITH_VPCLMULQDQ)
- check_vpclmulqdq_intrinsics()
- if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX512)
- add_definitions(-DX86_VPCLMULQDQ_CRC)
- set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
- add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
- set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
- else()
- set(WITH_VPCLMULQDQ OFF)
- endif()
- endif()
+ include(cmake/arch-x86.cmake)
endif()
endif()
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
#endif
#ifdef X86_SSE2
-uint32_t chunksize_sse2(void);
-uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+# if !defined(ZARCHVER) || ZARCHVER == 1
+ uint32_t chunksize_sse2(void);
+ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+ void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+# if !defined(WITHOUT_CHORBA)
+ uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len);
+# endif
+# endif
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
- uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
+# if !defined(ZARCHVER) || ZARCHVER <= 2
void slide_hash_sse2(deflate_state *s);
+# ifdef HAVE_BUILTIN_CTZ
+ uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+ uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
+ uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
+# endif
# endif
- void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
-# if !defined(WITHOUT_CHORBA)
- uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len);
-# endif
#endif
#ifdef X86_SSSE3
-uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
-uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+# if !defined(ZARCHVER) || ZARCHVER == 1 || defined(X86_AVX512VNNI)
+ uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+# endif
+
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+ void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+# endif
#endif
#ifdef X86_SSE41
-# if !defined(WITHOUT_CHORBA)
+# if !defined(WITHOUT_CHORBA)
uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
-# endif
+# endif
#endif
#ifdef X86_SSE42
-uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+# endif
#endif
#ifdef X86_AVX2
-uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
-uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint32_t chunksize_avx2(void);
-uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+ uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+ uint32_t chunksize_avx2(void);
+ uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+ void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+# endif
# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
- uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+ uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
+ uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
+# endif
void slide_hash_avx2(deflate_state *s);
# endif
- void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
+
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
--- /dev/null
+add_definitions(-DX86_FEATURES)
+list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_functions.h)
+if(WITH_RUNTIME_CPU_DETECTION)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c)
+endif()
+
+if(MSVC)
+ list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
+endif()
+
+check_xsave_intrinsics()
+check_sse2_intrinsics()
+check_ssse3_intrinsics()
+check_sse41_intrinsics()
+check_sse42_intrinsics()
+check_avx2_intrinsics()
+check_avx512_intrinsics()
+check_avx512vnni_intrinsics()
+check_pclmulqdq_intrinsics()
+check_vpclmulqdq_intrinsics()
+
+if(${ARCH} MATCHES "x86_64" AND ${WITH_X86_64_ARCHVER} GREATER_EQUAL "2" AND ${WITH_X86_64_ARCHVER} LESS_EQUAL "4")
+ if(WITH_NATIVE_INSTRUCTIONS)
+    message(FATAL_ERROR "WITH_NATIVE_INSTRUCTIONS cannot be enabled together with WITH_X86_64_ARCHVER > 1")
+ endif()
+ add_definitions(-DZARCHVER=${WITH_X86_64_ARCHVER})
+endif()
+
+## Handle the instruction set groups for each x86-64 arch version
+# x86-64-v2 (up to SSE4.2)
+if(${WITH_X86_64_ARCHVER} EQUAL "2")
+ if(WITH_SSE2 AND HAVE_SSE2_INTRIN AND WITH_SSSE3 AND HAVE_SSSE3_INTRIN AND WITH_SSE41 AND HAVE_SSE41_INTRIN
+ AND WITH_SSE42 AND HAVE_SSE42_INTRIN)
+
+ add_definitions(-DX86_SSE2 -DX86_SSSE3 -DX86_SSE41 -DX86_SSE42)
+ set(ARCHVER_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/chunkset_ssse3.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/chorba_sse41.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+ set_property(SOURCE ${ARCHVER_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
+
+ add_feature_info(SSE42_ADLER32 1 "Support SSE42-accelerated adler32, using \"${SSE42FLAG}\"")
+ add_feature_info(SSSE3_CHUNKSET 1 "Support SSSE3-accelerated chunkset, using \"${SSE42FLAG}\"")
+ add_feature_info(SSE41_CHORBA 1 "Support SSE41-accelerated chorba, using \"${SSE42FLAG}\"")
+ add_feature_info(SSE2_COMPARE256 1 "Support SSE2-accelerated compare256, using \"${SSE42FLAG}\"")
+ add_feature_info(SSE2_SLIDEHASH 1 "Support SSE2-accelerated slidehash, using \"${SSE42FLAG}\"")
+
+ else()
+    message(FATAL_ERROR "WITH_X86_64_ARCHVER=2 requires SSE2, SSSE3, SSE41 and SSE42, and their intrinsics to be supported and enabled.")
+ endif()
+endif()
+
+# x86-64-v3 (up to AVX2)
+if(${WITH_X86_64_ARCHVER} EQUAL "3")
+ if(WITH_SSE2 AND HAVE_SSE2_INTRIN AND WITH_SSSE3 AND HAVE_SSSE3_INTRIN AND WITH_SSE41 AND HAVE_SSE41_INTRIN
+ AND WITH_SSE42 AND HAVE_SSE42_INTRIN AND WITH_AVX2 AND HAVE_AVX2_INTRIN)
+
+ add_definitions(-DX86_SSE2 -DX86_SSSE3 -DX86_SSE41 -DX86_SSE42 -DX86_AVX2)
+ set(ARCHVER_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/adler32_avx2.c ${ARCHDIR}/chunkset_avx2.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/chorba_sse41.c ${ARCHDIR}/compare256_avx2.c ${ARCHDIR}/slide_hash_avx2.c)
+ set_property(SOURCE ${ARCHVER_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
+
+ add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"")
+ add_feature_info(AVX2_CHUNKSET 1 "Support AVX2-accelerated chunkset, using \"${AVX2FLAG}\"")
+ add_feature_info(SSE41_CHORBA 1 "Support SSE41-accelerated chorba, using \"${AVX2FLAG}\"")
+ add_feature_info(AVX2_COMPARE256 1 "Support AVX2-accelerated compare256, using \"${AVX2FLAG}\"")
+ add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-accelerated slidehash, using \"${AVX2FLAG}\"")
+
+ else()
+    message(FATAL_ERROR "WITH_X86_64_ARCHVER=3 requires SSE2, SSSE3, SSE41, SSE42 and AVX2, and their intrinsics to be supported and enabled.")
+ endif()
+endif()
+
+# x86-64-v4 (up to AVX512)
+if(${WITH_X86_64_ARCHVER} EQUAL "4")
+ if(WITH_SSE2 AND HAVE_SSE2_INTRIN AND WITH_SSSE3 AND HAVE_SSSE3_INTRIN AND WITH_SSE41 AND HAVE_SSE41_INTRIN
+ AND WITH_SSE42 AND HAVE_SSE42_INTRIN AND WITH_AVX2 AND HAVE_AVX2_INTRIN AND WITH_AVX512 AND HAVE_AVX512_INTRIN)
+
+ add_definitions(-DX86_SSE2 -DX86_SSSE3 -DX86_SSE41 -DX86_SSE42 -DX86_AVX2 -DX86_AVX512)
+ set(ARCHVER_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/adler32_avx2.c ${ARCHDIR}/adler32_avx512.c ${ARCHDIR}/chunkset_avx512.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/chorba_sse41.c ${ARCHDIR}/compare256_avx512.c ${ARCHDIR}/slide_hash_avx2.c)
+ list(APPEND ARCHVER_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+ set_property(SOURCE ${ARCHVER_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
+
+ add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
+ add_feature_info(AVX512_CHUNKSET 1 "Support AVX512-accelerated chunkset, using \"${AVX512FLAG}\"")
+ add_feature_info(SSE41_CHORBA 1 "Support SSE41-accelerated chorba, using \"${AVX512FLAG}\"")
+ add_feature_info(AVX512_COMPARE256 1 "Support AVX512-accelerated compare256, using \"${AVX512FLAG}\"")
+ add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-accelerated slidehash, using \"${AVX512FLAG}\"")
+
+ else()
+    message(FATAL_ERROR "WITH_X86_64_ARCHVER=4 requires SSE2, SSSE3, SSE41, SSE42, AVX2 and AVX512, and their intrinsics to be supported and enabled.")
+ endif()
+endif()
+
+list(APPEND ZLIB_ARCH_SRCS ${ARCHVER_SRCS})
+list(APPEND ZLIB_ARCH_HDRS ${ARCHVER_HDRS})
+
+
+# Handle individual instruction sets from x86-64-v1
+if(NOT ${ARCH} MATCHES "x86_64" OR ${WITH_X86_64_ARCHVER} EQUAL "1")
+ if(WITH_SSE2)
+ if("${ARCH}" MATCHES "i[3-6]86")
+ cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF)
+ endif()
+ if(HAVE_SSE2_INTRIN)
+ add_definitions(-DX86_SSE2)
+ set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+ list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
+ if(NOT ${ARCH} MATCHES "x86_64")
+ set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
+ endif()
+ else()
+ set(WITH_SSE2 OFF)
+ endif()
+ endif()
+
+ if(WITH_SSSE3)
+ if(HAVE_SSSE3_INTRIN AND WITH_SSE2)
+ add_definitions(-DX86_SSSE3)
+ set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
+ add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
+ set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_SSSE3 OFF)
+ endif()
+ endif()
+
+ if(WITH_SSE41)
+ if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
+ add_definitions(-DX86_SSE41)
+ set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
+ list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
+ set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_SSE41 OFF)
+ endif()
+ endif()
+
+ if(WITH_SSE42)
+ if(HAVE_SSE42_INTRIN AND WITH_SSE41)
+ add_definitions(-DX86_SSE42)
+ set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
+ add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
+ set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_SSE42 OFF)
+ endif()
+ endif()
+endif()
+
+# Handle individual instruction sets from x86-64-v2
+if(NOT ${ARCH} MATCHES "x86_64" OR ${WITH_X86_64_ARCHVER} LESS_EQUAL "2")
+ if(WITH_AVX2)
+ if(HAVE_AVX2_INTRIN AND WITH_SSE42)
+ add_definitions(-DX86_AVX2)
+ set(AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
+ add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"")
+ list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
+ add_feature_info(AVX2_CHUNKSET 1 "Support AVX2 optimized chunkset, using \"${AVX2FLAG}\"")
+ list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
+ add_feature_info(AVX2_COMPARE256 1 "Support AVX2 optimized compare256, using \"${AVX2FLAG}\"")
+ list(APPEND AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
+ add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
+ set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_AVX2 OFF)
+ endif()
+ endif()
+endif()
+
+# Handle individual instruction sets from x86-64-v3
+if(NOT ${ARCH} MATCHES "x86_64" OR ${WITH_X86_64_ARCHVER} LESS_EQUAL "3")
+ if(WITH_AVX512)
+ if(HAVE_AVX512_INTRIN AND WITH_AVX2)
+ add_definitions(-DX86_AVX512)
+ list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
+ add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
+ list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
+ add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
+ list(APPEND AVX512_SRCS ${ARCHDIR}/compare256_avx512.c)
+ add_feature_info(AVX512_COMPARE256 1 "Support AVX512 optimized compare256, using \"${AVX512FLAG}\"")
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+ list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+ set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_AVX512 OFF)
+ endif()
+ endif()
+endif()
+
+# Handle individual instruction sets not part of any arch version
+if(HAVE_XSAVE_INTRIN)
+ add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"")
+ if(WITH_RUNTIME_CPU_DETECTION)
+ set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
+ endif()
+ if(NOT (CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8.2))
+ add_definitions(-DX86_HAVE_XSAVE_INTRIN)
+ endif()
+endif()
+
+if(WITH_PCLMULQDQ)
+ if(HAVE_PCLMULQDQ_INTRIN AND WITH_SSE42)
+ add_definitions(-DX86_PCLMULQDQ_CRC)
+ set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
+ add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSE42FLAG} ${PCLMULFLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
+ set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_PCLMULQDQ OFF)
+ endif()
+endif()
+
+if(WITH_VPCLMULQDQ)
+ if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX512)
+ add_definitions(-DX86_VPCLMULQDQ_CRC)
+ set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
+ add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
+ set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_VPCLMULQDQ OFF)
+ endif()
+endif()
+
+if(WITH_AVX512VNNI)
+ if(HAVE_AVX512VNNI_INTRIN AND WITH_AVX2)
+ add_definitions(-DX86_AVX512VNNI)
+ add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"")
+ list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
+ if(${WITH_X86_64_ARCHVER} EQUAL "2")
+ add_definitions(-DX86_SSSE3)
+ list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_ssse3.c)
+ endif()
+ list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
+ set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_AVX512VNNI OFF)
+ endif()
+endif()
+
// X86 - SSE2
#ifdef X86_SSE2
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+# if !defined(__x86_64__) && !defined(_M_X64)
if (cf.x86.has_sse2)
# endif
{
+# if !defined(ZARCHVER) || ZARCHVER == 1
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
ft.chunksize = &chunksize_sse2;
-#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
- ft.crc32 = &crc32_chorba_sse2;
-#endif
ft.inflate_fast = &inflate_fast_sse2;
+# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
+ ft.crc32 = &crc32_chorba_sse2;
+# endif
+# endif
+# if !defined(ZARCHVER) || ZARCHVER <= 2
ft.slide_hash = &slide_hash_sse2;
-# ifdef HAVE_BUILTIN_CTZ
+# ifdef HAVE_BUILTIN_CTZ
ft.compare256 = &compare256_sse2;
ft.longest_match = &longest_match_sse2;
ft.longest_match_slow = &longest_match_slow_sse2;
+# endif
# endif
}
#endif
// X86 - SSSE3
#ifdef X86_SSSE3
if (cf.x86.has_ssse3) {
+# if !defined(ZARCHVER) || ZARCHVER == 1
ft.adler32 = &adler32_ssse3;
+# endif
+# if !defined(ZARCHVER) || ZARCHVER <= 2
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
ft.inflate_fast = &inflate_fast_ssse3;
+# endif
}
#endif
// X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {
+# if !defined(ZARCHVER) || ZARCHVER <= 2
ft.adler32_fold_copy = &adler32_fold_copy_sse42;
+# endif
}
#endif
// X86 - PCLMUL
* for the shift results as an operand, eliminating several register-register moves when the original value needs
* to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
+# if !defined(ZARCHVER) || ZARCHVER <= 3
ft.adler32 = &adler32_avx2;
ft.adler32_fold_copy = &adler32_fold_copy_avx2;
ft.chunkmemset_safe = &chunkmemset_safe_avx2;
ft.chunksize = &chunksize_avx2;
ft.inflate_fast = &inflate_fast_avx2;
+# endif
ft.slide_hash = &slide_hash_avx2;
# ifdef HAVE_BUILTIN_CTZ
+# if !defined(ZARCHVER) || ZARCHVER <= 3
ft.compare256 = &compare256_avx2;
ft.longest_match = &longest_match_avx2;
ft.longest_match_slow = &longest_match_slow_avx2;
+# endif
# endif
}
#endif
} \
BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
+#ifndef NO_C_FALLBACK
BENCHMARK_ADLER32(c, adler32_c, 1);
+#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_ADLER32(native, native_adler32, 1);
BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3);
#endif
#ifdef X86_AVX2
-BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2);
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2);
+# endif
#endif
#ifdef X86_AVX512
BENCHMARK_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#ifdef X86_SSE42
-BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
-BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, test_cpu_features.x86.has_sse42);
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
+ BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, test_cpu_features.x86.has_sse42);
+# endif
#endif
#ifdef X86_AVX2
-BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
-BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, test_cpu_features.x86.has_avx2);
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
+ BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, test_cpu_features.x86.has_avx2);
+# endif
#endif
#ifdef X86_AVX512
BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
+# endif
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
+# endif
#endif
#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
#ifndef WITHOUT_CHORBA
# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
- BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+# if !defined(ZARCHVER) || ZARCHVER == 1
+ BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+# endif
# if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
- BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
+ BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
# endif
#endif
BENCHMARK_SLIDEHASH(rvv, slide_hash_rvv, test_cpu_features.riscv.has_rvv);
#endif
#ifdef X86_SSE2
-BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
+# endif
#endif
#ifdef X86_AVX2
BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
TEST_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3)
#endif
#ifdef X86_AVX2
-TEST_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2)
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ TEST_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2)
+# endif
#endif
#ifdef X86_AVX512
TEST_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common)
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
+# if !defined(ZARCHVER) || ZARCHVER <= 2
+ TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
+# endif
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
+# if !defined(ZARCHVER) || ZARCHVER <= 3
+ TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
+# endif
#endif
#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common)
TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
-TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
+# if !defined(ZARCHVER) || ZARCHVER == 1
+ TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
+# endif
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)