From: Mika Lindqvist Date: Fri, 23 Mar 2018 12:48:53 +0000 (+0200) Subject: Separate feature checks for x86 and x86_64 X-Git-Tag: 1.9.9-b1~630 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=394ff60d450bc7ff86f78968c4e679c258b610da;p=thirdparty%2Fzlib-ng.git Separate feature checks for x86 and x86_64 * Don't check for SSE2 on anything else than i686 * Don't check for PCLMULQDQ on anything else than i686 or x86_64 * Check for SSE4.2 CRC intrinsics --- diff --git a/CMakeLists.txt b/CMakeLists.txt index d19b26af5..60868dab3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -318,6 +318,17 @@ else() }" HAVE_SSE42_INTRIN ) + check_c_source_compile_or_run( + "int main(void) + { + unsigned crc = 0; + char c = 'c'; + crc = __builtin_ia32_crc32qi(crc, c); + (void)crc; + return 0; + }" + HAVE_SSE42CRC_INTRIN + ) if(WITH_NATIVE_INSTRUCTIONS) set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") else() @@ -431,6 +442,9 @@ if(WITH_OPTIM) set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/insert_string_sse.c) add_feature_info(SSE4_CRC 1 "Support CRC hash generation using the SSE4.2 instruction set, using \"${SSE4FLAG}\"") add_intrinsics_option("${SSE4FLAG}") + if(HAVE_SSE42CRC_INTRIN) + add_definitions(-DX86_SSE4_2_CRC_INTRIN) + endif() if(WITH_NEW_STRATEGIES) add_definitions(-DX86_QUICK_STRATEGY) set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/deflate_quick.c) diff --git a/arch/x86/insert_string_sse.c b/arch/x86/insert_string_sse.c index cb756ac5c..bf09aabb8 100644 --- a/arch/x86/insert_string_sse.c +++ b/arch/x86/insert_string_sse.c @@ -32,6 +32,8 @@ ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsig #ifdef _MSC_VER h = _mm_crc32_u32(h, val); +#elif defined(X86_SSE4_2_CRC_INTRIN) + h = __builtin_ia32_crc32si(h, val); #else __asm__ __volatile__ ( "crc32 %1,%0\n\t" diff --git a/arch/x86/x86.c b/arch/x86/x86.c index 45bd63de2..c04e0a79a 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -46,15 +46,23 @@ static void cpuid(int info, 
unsigned* eax, unsigned* ebx, unsigned* ecx, unsigne void ZLIB_INTERNAL x86_check_features(void) { unsigned eax, ebx, ecx, edx; + unsigned maxbasic; + + cpuid(0, &maxbasic, &ebx, &ecx, &edx); + cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); x86_cpu_has_sse2 = edx & 0x4000000; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; - cpuid(7, &eax, &ebx, &ecx, &edx); + if (maxbasic >= 7) { + cpuid(7, &eax, &ebx, &ecx, &edx); - // check BMI1 bit - // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf - x86_cpu_has_tzcnt = ebx & 0x8; + // check BMI1 bit + // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf + x86_cpu_has_tzcnt = ebx & 0x8; + } else { + x86_cpu_has_tzcnt = 0; + } } diff --git a/configure b/configure index 9de84b975..71e2ff601 100755 --- a/configure +++ b/configure @@ -102,6 +102,7 @@ floatabi= native=0 sse2flag="-msse2" sse4flag="-msse4" +sse42flag="-msse4.2" pclmulflag="-mpclmul" without_optimizations=0 without_new_strategies=0 @@ -751,6 +752,7 @@ else fi # Check for SSE2 intrinsics +if test "${ARCH}" = "i686"; then cat > $test.c << EOF #include int main(void) { @@ -767,7 +769,31 @@ else HAVE_SSE2_INTRIN=0 fi +fi + +# Check for SSE4.2 CRC intrinsics +if test "${ARCH}" = "i686" || test "${ARCH}" = "x86_64"; then +cat > $test.c << EOF +int main(void) { + unsigned crc = 0; + char c = 'c'; + crc = __builtin_ia32_crc32qi(crc, c); + (void)crc; + return 0; +} +EOF +if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then + echo "Checking for SSE4.2 CRC intrinsics ... Yes." | tee -a configure.log + HAVE_SSE42CRC_INTRIN=1 +else + echo "Checking for SSE4.2 CRC intrinsics ... No." 
| tee -a configure.log + HAVE_SSE42CRC_INTRIN=0 +fi + +fi + # Check for PCLMULQDQ intrinsics +if test "${ARCH}" = "i686" || test "${ARCH}" = "x86_64"; then cat > $test.c << EOF #include #include @@ -793,32 +819,23 @@ if test $without_new_strategies -eq 0; then SFLAGS="${SFLAGS} -DMEDIUM_STRATEGY" fi +fi + ARCHDIR='arch/generic' ARCH_STATIC_OBJS='' ARCH_SHARED_OBJS='' # Set ARCH specific FLAGS case "${ARCH}" in - # x86 and x86_64 specific optimizations - i386 | i486 | i586 | i686 | x86_64) - ARCHDIR=arch/x86 - - case "${ARCH}" in - x86_64) - CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2" - SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2" - ;; - i386 | i486 | i586 | i686) - CFLAGS="${CFLAGS} -DX86" - SFLAGS="${SFLAGS} -DX86" - ;; - esac + # x86 specific optimizations + i386 | i486 | i586 | i686) + ARCHDIR=arch/x86 - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS" + SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS" - # Enable arch-specific optimizations? - if test $without_optimizations -eq 0; then + # Enable arch-specific optimizations? + if test $without_optimizations -eq 0; then CFLAGS="${CFLAGS} -DX86_CPUID" SFLAGS="${SFLAGS} -DX86_CPUID" @@ -830,6 +847,21 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -DX86_SSE2_FILL_WINDOW" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo" + + # Enable deflate_quick at level 1? 
+ # requires SSE2: code uses fill_window_sse + if test $without_new_strategies -eq 0; then + CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY" + SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} deflate_quick.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} deflate_quick.lo" + fi + fi + + if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_INTRIN" + SFLAGS="${SFLAGS} -DX86_SSE4_2_CRC_INTRIN" fi CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_HASH" @@ -844,9 +876,38 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo crc_pclmulqdq.lo" fi - # Enable deflate_quick at level 1? - # requires SSE2: code uses fill_window_sse - if test ${HAVE_SSE2_INTRIN} -eq 1 && test $without_new_strategies -eq 0; then + fi + ;; + + # x86_64 specific optimizations + x86_64) + ARCHDIR=arch/x86 + + CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS" + SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS" + + # Enable arch-specific optimizations? + if test $without_optimizations -eq 0; then + CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2_FILL_WINDOW -DX86_SSE4_2_CRC_HASH" + SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2_FILL_WINDOW -DX86_SSE4_2_CRC_HASH" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo" + + if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_INTRIN" + SFLAGS="${SFLAGS} -DX86_SSE4_2_CRC_INTRIN" + fi + + if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC" + SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc_folding.o crc_pclmulqdq.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo crc_pclmulqdq.lo" + fi + + # Enable deflate_quick at level 1? + if test $without_new_strategies -eq 0; then CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY" SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY"