From: Nathan Moinvaziri Date: Sat, 20 Mar 2021 21:10:14 +0000 (-0700) Subject: Only run checks for intrinsics if optimizations are enabled. X-Git-Tag: 2.1.0-beta1~578 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ce6c2b4b00ba463d61dc4cd83384ea40790db1a6;p=thirdparty%2Fzlib-ng.git Only run checks for intrinsics if optimizations are enabled. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 946265289..9fcef6904 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ include(FeatureSummary) include(cmake/detect-arch.cmake) include(cmake/detect-install-dirs.cmake) include(cmake/detect-coverage.cmake) +include(cmake/detect-intrinsics.cmake) include(cmake/detect-sanitizer.cmake) if(CMAKE_TOOLCHAIN_FILE) @@ -139,23 +140,11 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel") set(WARNFLAGS "-w3") set(WARNFLAGS_MAINTAINER "-w3 -Wcheck -Wremarks") set(WARNFLAGS_DISABLE "") - if(BASEARCH_X86_FOUND) - set(AVX2FLAG "-mavx2") - set(SSE2FLAG "-msse2") - set(SSSE3FLAG "-mssse3") - set(SSE4FLAG "-msse4.2") - endif() else() set(WARNFLAGS "/W3") set(WARNFLAGS_MAINTAINER "/W5") set(WARNFLAGS_DISABLE "") - if(BASEARCH_X86_FOUND) - set(AVX2FLAG "/arch:AVX2") - set(SSE2FLAG "/arch:SSE2") - set(SSSE3FLAG "/arch:SSSE3") - set(SSE4FLAG "/arch:SSE4.2") endif() - endif() if(WITH_NATIVE_INSTRUCTIONS) message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not supported on this configuration") endif() @@ -177,38 +166,22 @@ elseif(MSVC) if(NOT "${ARCH}" MATCHES "aarch64") set(NEONFLAG "/arch:VFPv4") endif() - elseif(BASEARCH_X86_FOUND) - if(NOT "${ARCH}" MATCHES "x86_64") - set(SSE2FLAG "/arch:SSE2") endif() - endif() if(WITH_NATIVE_INSTRUCTIONS) message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not supported on this configuration") endif() -else() - # catch all GNU C compilers as well as Clang and AppleClang - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(__GNUC__ ON) - endif() +elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") # Enable warnings in GCC and Clang - if(__GNUC__) set(WARNFLAGS "-Wall") set(WARNFLAGS_MAINTAINER "-Wextra -Wpedantic") set(WARNFLAGS_DISABLE "-Wno-implicit-fallthrough") - endif() if(WITH_NATIVE_INSTRUCTIONS) - if(__GNUC__) if(BASEARCH_PPC_FOUND) set(NATIVEFLAG "-mcpu=native") else() set(NATIVEFLAG "-march=native") endif() else() - message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not implemented yet on this configuration") - endif() - endif() - if(NOT NATIVEFLAG) - if(__GNUC__) if(BASEARCH_ARM_FOUND) if("${ARCH}" MATCHES "arm" AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") # Auto-detect support for ARM floating point ABI @@ -228,31 +201,7 @@ else() message(STATUS "ARM floating point arch not auto-detected") endif() endif() - # NEON - if("${ARCH}" MATCHES "aarch64") - set(NEONFLAG "-march=armv8-a+simd") - else() - # Check whether -mfpu=neon is available - set(CMAKE_REQUIRED_FLAGS "-mfpu=neon") - check_c_source_compiles( - "int main() { return 0; }" - MFPU_NEON_AVAILABLE FAIL_REGEX "not supported") - set(CMAKE_REQUIRED_FLAGS) - if(MFPU_NEON_AVAILABLE) - set(NEONFLAG "-mfpu=neon") endif() - endif() - # ACLE - set(ACLEFLAG "-march=armv8-a+crc") - elseif(BASEARCH_PPC_FOUND) - set(POWER8FLAG "-mcpu=power8") - elseif(BASEARCH_X86_FOUND) - set(AVX2FLAG "-mavx2") - set(SSE2FLAG "-msse2") - set(SSSE3FLAG "-mssse3") - set(SSE4FLAG "-msse4") - set(PCLMULFLAG "-mpclmul") - endif() # Check whether -fno-lto is available set(CMAKE_REQUIRED_FLAGS "-fno-lto") check_c_source_compiles( @@ -263,6 +212,9 @@ else() set(NOLTOFLAG "-fno-lto") endif() endif() +else() + if(WITH_NATIVE_INSTRUCTIONS) + message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not implemented yet on this configuration") endif() endif() @@ -483,119 +435,7 @@ if(MSVC) add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) endif() -if(BASEARCH_PPC_FOUND) - # Check if we have what we need for POWER8 optimizations - set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG}") - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); - }" - HAVE_POWER8 - ) - set(CMAKE_REQUIRED_FLAGS) -elseif(BASEARCH_X86_FOUND) - # Check whether compiler supports SSE2 instrinics - set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG}") - check_c_source_compile_or_run( - "#include - int main(void) { - __m128i zero = _mm_setzero_si128(); - (void)zero; - return 0; - }" - HAVE_SSE2_INTRIN - ) - # Check whether compiler supports SSSE3 intrinsics - set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG}") - check_c_source_compile_or_run( - "#include - int main(void) { - __m128i u, v, w; - u = _mm_set1_epi32(1); - v = _mm_set1_epi32(2); - w = _mm_hadd_epi32(u, v); - (void)w; - return 0; - }" - HAVE_SSSE3_INTRIN - ) - # Check whether compiler supports SSE4 CRC inline asm - set(CMAKE_REQUIRED_FLAGS "${SSE4FLAG}") - check_c_source_compile_or_run( - "int main(void) { - unsigned val = 0, h = 0; - #if defined(_MSC_VER) - { __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov val, eax } - #else - __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) ); - #endif - return (int)h; - }" - HAVE_SSE42CRC_INLINE_ASM - ) - # Check whether compiler supports SSE4 CRC intrinsics - check_c_source_compile_or_run( - "#include - int main(void) { - unsigned crc = 0; - char c = 'c'; - #if defined(_MSC_VER) - crc = _mm_crc32_u32(crc, c); - #else - crc = __builtin_ia32_crc32qi(crc, c); - #endif - (void)crc; - return 0; - }" - HAVE_SSE42CRC_INTRIN - ) - # Check whether compiler supports SSE4.2 compare string instrinics - check_c_source_compile_or_run( - "#include - int main(void) { - unsigned char a[64] = { 0 }; - unsigned char b[64] = { 0 }; - __m128i xmm_src0, xmm_src1; - xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a); - xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b); - return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0); - }" - HAVE_SSE42CMPSTR_INTRIN - ) - # Check whether compiler supports PCLMULQDQ intrinsics - set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG}") - if(NOT (APPLE AND "${ARCH}" MATCHES "i386")) - # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now. - check_c_source_compile_or_run( - "#include - int main(void) { - __m128i a = _mm_setzero_si128(); - __m128i b = _mm_setzero_si128(); - __m128i c = _mm_clmulepi64_si128(a, b, 0x10); - (void)c; - return 0; - }" - HAVE_PCLMULQDQ_INTRIN - ) - else() - set(HAVE_PCLMULQDQ_INTRIN NO) - endif() - # Check whether compiler supports AVX2 intrinics - set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG}") - check_c_source_compile_or_run( - "#include - int main(void) { - __m256i x = _mm256_set1_epi16(2); - const __m256i y = _mm256_set1_epi16(1); - x = _mm256_subs_epu16(x, y); - (void)x; - return 0; - }" - HAVE_AVX2_INTRIN - ) - set(CMAKE_REQUIRED_FLAGS) - +if(BASEARCH_X86_FOUND) # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true if("${ARCH}" MATCHES "i[3-6]86") cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF) @@ -717,13 +557,18 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c) if(WITH_ACLE AND NOT MSVC) + check_acle_intrinsics() + if(HAVE_ACLE_INTRIN) add_definitions(-DARM_ACLE_CRC_HASH) set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"") endif() + endif() if(WITH_NEON) + check_neon_intrinsics() + if(MFPU_NEON_AVAILABLE) add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH) set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/slide_neon.c) list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) @@ -734,8 +579,11 @@ if(WITH_OPTIM) add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"") add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"") endif() + endif() elseif(BASEARCH_PPC_FOUND) - if(WITH_POWER8 AND HAVE_POWER8) + if(WITH_POWER8) + check_power8_intrinsics() + if(HAVE_POWER8_INTRIN) add_definitions(-DPOWER8) add_definitions(-DPOWER_FEATURES) add_definitions(-DPOWER8_VSX_ADLER32) @@ -746,6 +594,7 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") endif() + endif() elseif(BASEARCH_S360_FOUND) if(WITH_DFLTCC_DEFLATE OR WITH_DFLTCC_INFLATE) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_common.c) @@ -765,7 +614,9 @@ if(WITH_OPTIM) if(MSVC) list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) endif() - if(WITH_AVX2 AND HAVE_AVX2_INTRIN) + if(WITH_AVX2) + check_avx2_intrinsics() + if(HAVE_AVX2_INTRIN) add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET) set(AVX2_SRCS ${ARCHDIR}/slide_avx.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") @@ -778,7 +629,10 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") endif() - if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)) + endif() + if(WITH_SSE4) + check_sse4_intrinsics() + if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) add_definitions(-DX86_SSE42_CRC_HASH) set(SSE42_SRCS ${ARCHDIR}/insert_string_sse.c) add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE4FLAG}\"") @@ -795,7 +649,10 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE4FLAG} ${NOLTOFLAG}") endif() - if(WITH_SSE2 AND HAVE_SSE2_INTRIN) + endif() + if(WITH_SSE2) + check_sse2_intrinsics() + if(HAVE_SSE2_INTRIN) add_definitions(-DX86_SSE2 -DX86_SSE2_CHUNKSET -DX86_SSE2_SLIDEHASH) set(SSE2_SRCS ${ARCHDIR}/chunkset_sse.c ${ARCHDIR}/slide_sse.c) list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) @@ -807,14 +664,20 @@ if(WITH_OPTIM) endif() endif() endif() - if(WITH_SSSE3 AND HAVE_SSSE3_INTRIN) + endif() + if(WITH_SSSE3) + check_ssse3_intrinsics() + if(HAVE_SSSE3_INTRIN) add_definitions(-DX86_SSSE3 -DX86_SSSE3_ADLER32) set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c) add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") endif() - if(WITH_PCLMULQDQ AND HAVE_PCLMULQDQ_INTRIN AND WITH_SSSE3 AND WITH_SSE4) + endif() + if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE4) + check_pclmulqdq_intrinsics() + if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) add_definitions(-DX86_PCLMULQDQ_CRC) set(PCLMULQDQ_SRCS ${ARCHDIR}/crc_folding.c) add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG}\"") @@ -823,6 +686,8 @@ if(WITH_OPTIM) endif() endif() endif() +endif() + message(STATUS "Architecture-specific source files: ${ZLIB_ARCH_SRCS}") #============================================================================ diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake new file mode 100644 index 000000000..a360b7e66 --- /dev/null +++ b/cmake/detect-intrinsics.cmake @@ -0,0 +1,221 @@ +# detect-intrinsics.cmake -- Detect compiler intrinsics support +# Licensed under the Zlib license, see LICENSE.md for details + +macro(check_acle_intrinsics) + if(NOT NATIVEFLAG) + set(ACLEFLAG "-march=armv8-a+crc") + endif() + # Check whether compiler supports ACLE flag + set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}") + check_c_source_compiles( + "int main() { return 0; }" + HAVE_ACLE_INTRIN FAIL_REGEX "not supported") + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_avx2_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX2FLAG "-mavx2") + else() + set(AVX2FLAG "/arch:AVX2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(AVX2FLAG "-mavx2") + endif() + endif() + # Check whether compiler supports AVX2 intrinics + set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m256i x = _mm256_set1_epi16(2); + const __m256i y = _mm256_set1_epi16(1); + x = _mm256_subs_epu16(x, y); + (void)x; + return 0; + }" + HAVE_AVX2_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_neon_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + if("${ARCH}" MATCHES "aarch64") + set(NEONFLAG "-march=armv8-a+simd") + else() + set(NEONFLAG "-mfpu=neon") + endif() + endif() + endif() + # Check whether compiler supports NEON flag + set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}") + check_c_source_compiles( + "int main() { return 0; }" + MFPU_NEON_AVAILABLE FAIL_REGEX "not supported") + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_pclmulqdq_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(PCLMULFLAG "-mpclmul") + endif() + endif() + # Check whether compiler supports PCLMULQDQ intrinsics + if(NOT (APPLE AND "${ARCH}" MATCHES "i386")) + # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now. + set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i c = _mm_clmulepi64_si128(a, b, 0x10); + (void)c; + return 0; + }" + HAVE_PCLMULQDQ_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) + else() + set(HAVE_PCLMULQDQ_INTRIN OFF) + endif() +endmacro() + +macro(check_power8_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(POWER8FLAG "-mcpu=power8") + endif() + endif() + # Check if we have what we need for POWER8 optimizations + set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG}") + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); + }" + HAVE_POWER8_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_sse2_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE2FLAG "-msse2") + else() + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(MSVC) + if(NOT "${ARCH}" MATCHES "x86_64") + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(SSE2FLAG "-msse2") + endif() + endif() + # Check whether compiler supports SSE2 instrinics + set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m128i zero = _mm_setzero_si128(); + (void)zero; + return 0; + }" + HAVE_SSE2_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() + +macro(check_ssse3_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSSE3FLAG "-mssse3") + else() + set(SSSE3FLAG "/arch:SSSE3") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(SSSE3FLAG "-mssse3") + endif() + endif() + # Check whether compiler supports SSSE3 intrinsics + set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG}") + check_c_source_compile_or_run( + "#include + int main(void) { + __m128i u, v, w; + u = _mm_set1_epi32(1); + v = _mm_set1_epi32(2); + w = _mm_hadd_epi32(u, v); + (void)w; + return 0; + }" + HAVE_SSSE3_INTRIN + ) +endmacro() + +macro(check_sse4_intrinsics) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE4FLAG "-msse4.2") + else() + set(SSE4FLAG "/arch:SSE4.2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + if(NOT NATIVEFLAG) + set(SSE4FLAG "-msse4") + endif() + endif() + # Check whether compiler supports SSE4 CRC inline asm + set(CMAKE_REQUIRED_FLAGS "${SSE4FLAG}") + check_c_source_compile_or_run( + "int main(void) { + unsigned val = 0, h = 0; + #if defined(_MSC_VER) + { __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov val, eax } + #else + __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) ); + #endif + return (int)h; + }" + HAVE_SSE42CRC_INLINE_ASM + ) + # Check whether compiler supports SSE4 CRC intrinsics + check_c_source_compile_or_run( + "#include + int main(void) { + unsigned crc = 0; + char c = 'c'; + #if defined(_MSC_VER) + crc = _mm_crc32_u32(crc, c); + #else + crc = __builtin_ia32_crc32qi(crc, c); + #endif + (void)crc; + return 0; + }" + HAVE_SSE42CRC_INTRIN + ) + # Check whether compiler supports SSE4.2 compare string instrinics + check_c_source_compile_or_run( + "#include + int main(void) { + unsigned char a[64] = { 0 }; + unsigned char b[64] = { 0 }; + __m128i xmm_src0, xmm_src1; + xmm_src0 = _mm_loadu_si128((__m128i *)(char *)a); + xmm_src1 = _mm_loadu_si128((__m128i *)(char *)b); + return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0); + }" + HAVE_SSE42CMPSTR_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) +endmacro() diff --git a/configure b/configure index 8cd48786e..3e69da897 100755 --- a/configure +++ b/configure @@ -972,76 +972,116 @@ else echo "Checking for __builtin_ctzll ... No." | tee -a configure.log fi -# Check for SSE2 intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF +check_avx2_intrinsics() { + # Check whether compiler supports AVX2 intrinsics + cat > $test.c << EOF #include int main(void) { - __m128i zero = _mm_setzero_si128(); - (void)zero; + __m256i x = _mm256_set1_epi16(2); + const __m256i y = _mm256_set1_epi16(1); + x = _mm256_subs_epu16(x, y); + (void)x; return 0; } EOF - if try ${CC} ${CFLAGS} ${sse2flag} $test.c; then - echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log - HAVE_SSE2_INTRIN=1 - else - echo "Checking for SSE2 intrinsics ... No." | tee -a configure.log - HAVE_SSE2_INTRIN=0 - fi - ;; -esac + if try ${CC} ${CFLAGS} ${avx2flag} $test.c; then + echo "Checking for AVX2 intrinsics ... Yes." | tee -a configure.log + HAVE_AVX2_INTRIN=1 + else + echo "Checking for AVX2 intrinsics ... No." | tee -a configure.log + HAVE_AVX2_INTRIN=0 + fi +} -# Check for SSSE3 intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF -#include -int main(void) -{ - __m128i u, v, w; - u = _mm_set1_epi32(1); - v = _mm_set1_epi32(2); - w = _mm_hadd_epi32(u, v); - (void)w; +check_neon_intrinsics() { + # Check whether -mfpu=neon is available on ARM processors. + cat > $test.c << EOF +int main() { return 0; } +EOF + if try $CC -c $CFLAGS -mfpu=neon $test.c; then + MFPU_NEON_AVAILABLE=1 + echo "Check whether -mfpu=neon is available ... Yes." | tee -a configure.log + else + MFPU_NEON_AVAILABLE=0 + echo "Check whether -mfpu=neon is available ... No." | tee -a configure.log + fi +} + +check_pclmulqdq_intrinsics() { + # Check whether compiler supports PCLMULQDQ intrinsics + cat > $test.c << EOF +#include +#include +int main(void) { + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i c = _mm_clmulepi64_si128(a, b, 0x10); + (void)c; return 0; } EOF - if try ${CC} ${CFLAGS} ${ssse3flag} $test.c; then - echo "Checking for SSSE3 intrinsics ... Yes." | tee -a configure.log - HAVE_SSSE3_INTRIN=1 - else - echo "Checking for SSSE3 intrinsics ... No." | tee -a configure.log - HAVE_SSSE3_INTRIN=0 - fi - ;; -esac + if try ${CC} ${CFLAGS} ${pclmulflag} $test.c; then + echo "Checking for PCLMULQDQ intrinsics ... Yes." | tee -a configure.log + HAVE_PCLMULQDQ_INTRIN=1 + else + echo "Checking for PCLMULQDQ intrinsics ... No." | tee -a configure.log + HAVE_PCLMULQDQ_INTRIN=0 + fi +} -# Check for SSE4.2 CRC inline assembly -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF +check_power8_intrinsics() { + # Check whether features needed by POWER optimisations are available + cat > $test.c << EOF +#include +int main() { return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); } +EOF + if try $CC -c $CFLAGS -mcpu=power8 $test.c; then + HAVE_POWER8_INTRIN=1 + echo "Check whether POWER8 instructions are available ... Yes." | tee -a configure.log + else + HAVE_POWER8_INTRIN=0 + echo "Check whether POWER8 instructions are available ... No." | tee -a configure.log + fi +} + +check_sse2_intrinsics() { + # Check whether compiler supports SSE2 intrinsics + cat > $test.c << EOF +#include +int main(void) { + __m128i zero = _mm_setzero_si128(); + (void)zero; + return 0; +} +EOF + if try ${CC} ${CFLAGS} ${sse2flag} $test.c; then + echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log + HAVE_SSE2_INTRIN=1 + else + echo "Checking for SSE2 intrinsics ... No." | tee -a configure.log + HAVE_SSE2_INTRIN=0 + fi +} + +check_sse4_intrinsics() { + # Check whether compiler supports SSE4 CRC inline asm + cat > $test.c << EOF int main(void) { unsigned val = 0, h = 0; __asm__ __volatile__ ( "crc32 %1,%0" : "+r" (h) : "r" (val) ); return (int) h; } EOF - if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then - echo "Checking for SSE4.2 CRC inline assembly ... Yes." | tee -a configure.log - HAVE_SSE42CRC_INLINE_ASM=1 - else - echo "Checking for SSE4.2 CRC inline assembly ... No." | tee -a configure.log - HAVE_SSE42CRC_INLINE_ASM=0 - fi - ;; -esac + if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then + echo "Checking for SSE4.2 CRC inline assembly ... Yes." | tee -a configure.log + HAVE_SSE42CRC_INLINE_ASM=1 + else + echo "Checking for SSE4.2 CRC inline assembly ... No." | tee -a configure.log + HAVE_SSE42CRC_INLINE_ASM=0 + fi -# Check for SSE4.2 CRC intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF + # Check whether compiler supports SSE4.2 CRC intrinsics + cat > $test.c << EOF int main(void) { unsigned crc = 0; char c = 'c'; @@ -1050,20 +1090,16 @@ int main(void) { return 0; } EOF - if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then - echo "Checking for SSE4.2 CRC intrinsics ... Yes." | tee -a configure.log - HAVE_SSE42CRC_INTRIN=1 - else - echo "Checking for SSE4.2 CRC intrinsics ... No." | tee -a configure.log - HAVE_SSE42CRC_INTRIN=0 - fi - ;; -esac + if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then + echo "Checking for SSE4.2 CRC intrinsics ... Yes." | tee -a configure.log + HAVE_SSE42CRC_INTRIN=1 + else + echo "Checking for SSE4.2 CRC intrinsics ... No." | tee -a configure.log + HAVE_SSE42CRC_INTRIN=0 + fi -# Check for SSE4.2 compare string intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF + # Check whether compiler supports SSE4.2 compare string intrinsics + cat > $test.c << EOF #include int main(void) { @@ -1075,38 +1111,40 @@ int main(void) return _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, 0); } EOF - if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then - echo "Checking for SSE4.2 compare string intrinsics ... Yes." | tee -a configure.log - HAVE_SSE42CMPSTR_INTRIN=1 - else - echo "Checking for SSE4.2 compare string intrinsics ... No." | tee -a configure.log - HAVE_SSE42CMPSTR_INTRIN=0 - fi - ;; -esac + if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then + echo "Checking for SSE4.2 compare string intrinsics ... Yes." | tee -a configure.log + HAVE_SSE42CMPSTR_INTRIN=1 + else + echo "Checking for SSE4.2 compare string intrinsics ... No." | tee -a configure.log + HAVE_SSE42CMPSTR_INTRIN=0 + fi +} -# Check for PCLMULQDQ intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF -#include -#include -int main(void) { - __m128i a = _mm_setzero_si128(); - __m128i b = _mm_setzero_si128(); - __m128i c = _mm_clmulepi64_si128(a, b, 0x10); - (void)c; +check_ssse3_intrinsics() { + # Check whether compiler supports SSSE3 intrinsics + cat > $test.c << EOF +#include +int main(void) +{ + __m128i u, v, w; + u = _mm_set1_epi32(1); + v = _mm_set1_epi32(2); + w = _mm_hadd_epi32(u, v); + (void)w; return 0; } EOF - if try ${CC} ${CFLAGS} ${pclmulflag} $test.c; then - echo "Checking for PCLMULQDQ intrinsics ... Yes." | tee -a configure.log - HAVE_PCLMULQDQ_INTRIN=1 - else - echo "Checking for PCLMULQDQ intrinsics ... No." | tee -a configure.log - HAVE_PCLMULQDQ_INTRIN=0 - fi + if try ${CC} ${CFLAGS} ${ssse3flag} $test.c; then + echo "Checking for SSSE3 intrinsics ... Yes." | tee -a configure.log + HAVE_SSSE3_INTRIN=1 + else + echo "Checking for SSSE3 intrinsics ... No." | tee -a configure.log + HAVE_SSSE3_INTRIN=0 + fi +} +case "${ARCH}" in + i386 | i486 | i586 | i686 | x86_64) # Enable deflate_medium at level 1 if test $without_new_strategies -eq 1; then CFLAGS="${CFLAGS} -DNO_QUICK_STRATEGY" @@ -1120,75 +1158,6 @@ EOF ;; esac -# Check for AVX2 intrinsics -case "${ARCH}" in - i386 | i486 | i586 | i686 | x86_64) - cat > $test.c << EOF -#include -int main(void) { - __m256i x = _mm256_set1_epi16(2); - const __m256i y = _mm256_set1_epi16(1); - x = _mm256_subs_epu16(x, y); - (void)x; - return 0; -} -EOF - if try ${CC} ${CFLAGS} ${avx2flag} $test.c; then - echo "Checking for AVX2 intrinsics ... Yes." | tee -a configure.log - HAVE_AVX2_INTRIN=1 - else - echo "Checking for AVX2 intrinsics ... No." | tee -a configure.log - HAVE_AVX2_INTRIN=0 - fi - ;; -esac - - -# Check whether -mfpu=neon is available on ARM processors. -case "${ARCH}" in - arm*) - cat > $test.c << EOF -int main() { return 0; } -EOF - if try $CC -c $CFLAGS -mfpu=neon $test.c; then - MFPU_NEON_AVAILABLE=1 - echo "Check whether -mfpu=neon is available ... Yes." | tee -a configure.log - else - MFPU_NEON_AVAILABLE=0 - echo "Check whether -mfpu=neon is available ... No." | tee -a configure.log - fi - ;; -esac - -# Check whether features needed by POWER optimisations are available -case "${ARCH}" in - powerpc*) - cat > $test.c << EOF -#include -int main() { return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); } -EOF - if try $CC -c $CFLAGS -mcpu=power8 $test.c; then - HAVE_POWER8=1 - echo "Check whether POWER8 instructions are available ... Yes." | tee -a configure.log - else - HAVE_POWER8=0 - echo "Check whether POWER8 instructions are available ... No." | tee -a configure.log - fi -esac - -# Check whether sys/sdt.h is available -cat > $test.c << EOF -#include -int main() { return 0; } -EOF -if try ${CC} ${CFLAGS} $test.c; then - echo "Checking for sys/sdt.h ... Yes." | tee -a configure.log - CFLAGS="$CFLAGS -DHAVE_SYS_SDT_H" - SFLAGS="$SFLAGS -DHAVE_SYS_SDT_H" -else - echo "Checking for sys/sdt.h ... No." | tee -a configure.log -fi - ARCHDIR='arch/generic' ARCH_STATIC_OBJS='' ARCH_SHARED_OBJS='' @@ -1210,6 +1179,8 @@ case "${ARCH}" in ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo" + check_avx2_intrinsics + if test ${HAVE_AVX2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" @@ -1217,6 +1188,8 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo" fi + check_sse4_intrinsics + if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH" SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH" @@ -1230,6 +1203,8 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} insert_string_sse.lo" fi + check_sse4_intrinsics + if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CMP_STR" SFLAGS="${SFLAGS} -DX86_SSE42_CMP_STR" @@ -1238,6 +1213,8 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_sse.lo" fi + check_sse2_intrinsics + if test ${HAVE_SSE2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET" SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_CHUNKSET" @@ -1250,6 +1227,8 @@ case "${ARCH}" in fi fi + check_ssse3_intrinsics + if test ${HAVE_SSSE3_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" @@ -1257,6 +1236,8 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo" fi + check_pclmulqdq_intrinsics + if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC" SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC" @@ -1351,6 +1332,10 @@ EOF SFLAGS="${SFLAGS} ${floatabi}" fi + if test $without_optimizations -eq 0; then + check_neon_intrinsics + fi + case "${ARCH}" in armv[345]*) if test $without_optimizations -eq 0; then @@ -1527,7 +1512,10 @@ EOF ARCHDIR=arch/power if test $without_optimizations -eq 0; then - if test $HAVE_POWER8 -eq 1; then + + check_power8_intrinsics + + if test $HAVE_POWER8_INTRIN -eq 1; then CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH" SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_SLIDEHASH"