From: Nathan Moinvaziri Date: Sun, 11 Aug 2019 10:49:01 +0000 (-0700) Subject: Fixed optimizations not being used when compiler is msvc. (#376) X-Git-Tag: 1.9.9-b1~454 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=243cf61c4a1bea6ae7b3b0295fefb87d70f03d8c;p=thirdparty%2Fzlib-ng.git Fixed optimizations not being used when compiler is msvc. (#376) This is the issue I mentioned in #370. Optimization code such as crc_folding.c, deflate_quick_sse.c, fill_window_sse.c, and insert_string_sse.c was not being compiled when the compiler was MSVC because the checks for the intrinsics were not being done and the HAVE_[TARGET]_INTRIN variables weren't being set. I could have simply set HAVE_[TARGET]_INTRIN variables to ON manually in the case of MSVC, but it is better this way to have one path for all the compilers (one that compiles or runs a small test program to make the determination). I have just added MSVC code where necessary in the checks. * Rename HAVE_SSE42_INTRIN to HAVE_SSE42CRC_INLINE_ASM. * Added msvc inline asm support to insert_string_sse.c --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 420fc1c6..de87b59f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ string(REGEX REPLACE ".*#define[ \t]+ZLIBNG_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" message(STATUS "ZLIB_HEADER_VERSION: ${ZLIB_HEADER_VERSION}") message(STATUS "ZLIBNG_HEADER_VERSION: ${ZLIBNG_HEADER_VERSION}") -project(zlib +project(zlib VERSION ${ZLIB_HEADER_VERSION} LANGUAGES C) @@ -400,80 +400,88 @@ if(MSVC) set(CMAKE_DEBUG_POSTFIX "d") add_definitions(-D_CRT_SECURE_NO_DEPRECATE) add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) +endif() + +# Check whether compiler supports SSE2 instrinics +if(WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") else() - # - # Not MSVC, so we need to check if we have the MS-style SSE etc. 
intrinsics - # - if(WITH_NATIVE_INSTRUCTIONS) - set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") - else() - set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG}") - endif() + set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG}") +endif() +check_c_source_compile_or_run( + "#include + int main(void) + { + __m128i zero = _mm_setzero_si128(); + (void)zero; + return 0; + }" + HAVE_SSE2_INTRIN +) +set(CMAKE_REQUIRED_FLAGS) + +# Check whether compiler supports SSE4 CRC inline asm +if(WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") +else() + set(CMAKE_REQUIRED_FLAGS "${SSE4FLAG}") +endif() +check_c_source_compile_or_run( + "int main(void) + { + unsigned val = 0, h = 0; + #if defined(_MSC_VER) + { __asm mov edx, h __asm mov eax, val __asm crc32 eax, edx __asm mov val, eax } + #else + __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) ); + #endif + return (int) h; + }" + HAVE_SSE42CRC_INLINE_ASM +) +# Check whether compiler supports SSE4 CRC instrinics +check_c_source_compile_or_run( + "#include + int main(void) + { + unsigned crc = 0; + char c = 'c'; + #if defined(_MSC_VER) + crc = _mm_crc32_u32(crc, c); + #else + crc = __builtin_ia32_crc32qi(crc, c); + #endif + (void)crc; + return 0; + }" + HAVE_SSE42CRC_INTRIN +) +set(CMAKE_REQUIRED_FLAGS) + +# Check whether compiler supports PCLMULQDQ intrinics +if(WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") +else() + set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG}") +endif() +if(NOT (APPLE AND ${ARCH} MATCHES "i386")) + # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now. 
check_c_source_compile_or_run( "#include int main(void) { - __m128i zero = _mm_setzero_si128(); - (void)zero; + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i c = _mm_clmulepi64_si128(a, b, 0x10); + (void)c; return 0; }" - HAVE_SSE2_INTRIN + HAVE_PCLMULQDQ_INTRIN ) - set(CMAKE_REQUIRED_FLAGS) - - if(WITH_NATIVE_INSTRUCTIONS) - set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") - else() - # Use the generic SSE4 enabler option to check for the SSE4.2 instruction we require: - set(CMAKE_REQUIRED_FLAGS "${SSE4FLAG}") - endif() - check_c_source_compile_or_run( - "int main(void) - { - unsigned val = 0, h = 0; - __asm__ __volatile__ ( \"crc32 %1,%0\" : \"+r\" (h) : \"r\" (val) ); - return (int) h; - }" - HAVE_SSE42_INTRIN - ) - check_c_source_compile_or_run( - "int main(void) - { - unsigned crc = 0; - char c = 'c'; - crc = __builtin_ia32_crc32qi(crc, c); - (void)crc; - return 0; - }" - HAVE_SSE42CRC_INTRIN - ) - set(CMAKE_REQUIRED_FLAGS) - - if(WITH_NATIVE_INSTRUCTIONS) - set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") - else() - set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG}") - endif() - if(NOT (APPLE AND ${ARCH} MATCHES "i386")) - # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now. 
- check_c_source_compile_or_run( - "#include - #include - int main(void) - { - __m128i a = _mm_setzero_si128(); - __m128i b = _mm_setzero_si128(); - __m128i c = _mm_clmulepi64_si128(a, b, 0x10); - (void)c; - return 0; - }" - HAVE_PCLMULQDQ_INTRIN - ) - else() - set(HAVE_PCLMULQDQ_INTRIN NO) - endif() - set(CMAKE_REQUIRED_FLAGS) +else() + set(HAVE_PCLMULQDQ_INTRIN NO) endif() +set(CMAKE_REQUIRED_FLAGS) # Check whether -mfpu=neon is available set(CMAKE_REQUIRED_FLAGS "-mfpu=neon") @@ -570,7 +578,7 @@ if(WITH_OPTIM) elseif("${ARCHDIR}" MATCHES "arch/x86") add_definitions("-DX86_CPUID") set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/x86.c) - if(HAVE_SSE42_INTRIN) + if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN) add_definitions(-DX86_SSE4_2_CRC_HASH) set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/insert_string_sse.c) add_feature_info(SSE4_CRC 1 "Support CRC hash generation using the SSE4.2 instruction set, using \"${SSE4FLAG}\"") @@ -599,7 +607,7 @@ if(WITH_OPTIM) add_definitions(-DX86_PCLMULQDQ_CRC) set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/crc_folding.c) add_intrinsics_option("${PCLMULFLAG}") - if(HAVE_SSE42_INTRIN) + if(HAVE_SSE42CRC_INLINE_ASM) add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${PCLMULFLAG}\"") else() add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${PCLMULFLAG} ${SSE4FLAG}\"") @@ -882,4 +890,3 @@ if (ZLIB_ENABLE_TESTS) endif() FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES) - diff --git a/arch/x86/insert_string_sse.c b/arch/x86/insert_string_sse.c index 394e5093..59962151 100644 --- a/arch/x86/insert_string_sse.c +++ b/arch/x86/insert_string_sse.c @@ -30,16 +30,27 @@ ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsig if (s->level >= TRIGGER_LEVEL) val &= 0xFFFFFF; -#ifdef _MSC_VER +#if defined(X86_SSE4_2_CRC_INTRIN) +# ifdef _MSC_VER h = _mm_crc32_u32(h, val); -#elif defined(X86_SSE4_2_CRC_INTRIN) +# else h = __builtin_ia32_crc32si(h, 
val); +# endif #else +# ifdef _MSC_VER + __asm { + mov edx h + mov eax, val + crc32 eax, edx + mov val, eax + }; +# else __asm__ __volatile__ ( "crc32 %1,%0\n\t" : "+r" (h) : "r" (val) ); +# endif #endif Pos head = s->head[h & s->hash_mask]; if (head != str+idx) {