From: Mika Lindqvist Date: Sun, 13 Mar 2022 15:12:42 +0000 (+0200) Subject: Allow bypassing runtime feature check of TZCNT instructions. X-Git-Tag: 2.1.0-beta1~335 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=db3feb4cf2de73a84039e90c89f0c0b393e620bc;p=thirdparty%2Fzlib-ng.git Allow bypassing runtime feature check of TZCNT instructions. * This avoids a conditional branch when it's known at build time that TZCNT instructions are always supported --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 440a7169..60ccbe25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ elseif(BASEARCH_S360_FOUND) option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF) option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) elseif(BASEARCH_X86_FOUND) + option(FORCE_TZCNT "Always assume CPU is TZCNT capable" OFF) option(WITH_AVX2 "Build with AVX2" ON) option(WITH_AVX512 "Build with AVX512" ON) option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON) @@ -792,6 +793,10 @@ if(WITH_OPTIM) set(WITH_SSE4 OFF) endif() endif() + if(FORCE_TZCNT) + add_definitions(-DX86_NOCHECK_TZCNT) + endif() + add_feature_info(FORCE_TZCNT FORCE_TZCNT "Assume CPU is TZCNT capable") if(WITH_SSE2) check_sse2_intrinsics() if(HAVE_SSE2_INTRIN) diff --git a/README.md b/README.md index 3c4fd00b..668c29cb 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,7 @@ Advanced Build Options | ZLIB_DUAL_LINK | | Dual link tests with system zlib | OFF | | UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) | | | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) | +| FORCE_TZCNT | --force-tzcnt | Skip runtime check for TZCNT instructions | OFF | | WITH_AVX2 | | Build with AVX2 intrinsics | ON | | WITH_AVX512 | | Build with AVX512 intrinsics | ON | | WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | diff --git a/configure b/configure index 149ae732..bd237236 100755 --- a/configure 
+++ b/configure @@ -103,6 +103,7 @@ with_fuzzers=0 floatabi= native=0 forcesse2=0 +forcetzcnt=0 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal # instruction scheduling unless you specify a reasonable -mtune= target avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mtune=cascadelake" @@ -173,6 +174,7 @@ case "$1" in echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log echo ' [--with-reduced-mem] Reduced memory usage for special cases (reduces performance)' | tee -a configure.log echo ' [--force-sse2] Assume SSE2 instructions are always available (disabled by default on x86, enabled on x86_64)' | tee -a configure.log + echo ' [--force-tzcnt] Assume TZCNT instructions are always available (disabled by default)' | tee -a configure.log echo ' [--with-sanitizer] Build with sanitizer (memory, address, undefined)' | tee -a configure.log echo ' [--with-fuzzers] Build test/fuzz (disabled by default)' | tee -a configure.log echo ' [--native] Compiles with full instruction set supported on this host' | tee -a configure.log @@ -206,6 +208,7 @@ case "$1" in --without-crc32-vx) buildcrc32vx=0; shift ;; --with-reduced-mem) reducedmem=1; shift ;; --force-sse2) forcesse2=1; shift ;; + --force-tzcnt) forcetzcnt=1; shift ;; -n | --native) native=1; shift ;; -a*=* | --archs=*) ARCHS=$(echo $1 | sed 's/.*=//'); shift ;; --sysconfdir=*) echo "ignored option: --sysconfdir" | tee -a configure.log; shift ;; @@ -1589,6 +1592,11 @@ case "${ARCH}" in fi fi fi + + if test $forcetzcnt -eq 1; then + CFLAGS="${CFLAGS} -DX86_NOCHECK_TZCNT" + SFLAGS="${SFLAGS} -DX86_NOCHECK_TZCNT" + fi fi ;; diff --git a/fallback_builtins.h b/fallback_builtins.h index 8abec2fa..c9fcceac 100644 --- a/fallback_builtins.h +++ b/fallback_builtins.h @@ -14,7 +14,9 @@ */ static __forceinline unsigned long __builtin_ctz(uint32_t value) { #ifdef X86_FEATURES +# ifndef X86_NOCHECK_TZCNT if (x86_cpu_has_tzcnt) +# endif return _tzcnt_u32(value); #endif 
unsigned long trailing_zero; @@ -29,7 +31,9 @@ static __forceinline unsigned long __builtin_ctz(uint32_t value) { */ static __forceinline unsigned long long __builtin_ctzll(uint64_t value) { #ifdef X86_FEATURES +# ifndef X86_NOCHECK_TZCNT if (x86_cpu_has_tzcnt) +# endif return _tzcnt_u64(value); #endif unsigned long trailing_zero;