From 5159088fe8d58d1e2273b9952c8c2dc5e760bebc Mon Sep 17 00:00:00 2001 From: Hans Kristian Rosbach Date: Fri, 10 Oct 2025 13:26:12 +0200 Subject: [PATCH] Remove force-sse2 config option from x86 builds. Due to major refactoring done long ago, this option no longer avoids a branch in a hot path, it currently only removes a single if check during init. --- CMakeLists.txt | 8 -------- README.md | 1 - arch/x86/x86_functions.h | 2 +- configure | 8 -------- functable.c | 2 +- 5 files changed, 2 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b1d52e5..7a30ed2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1042,20 +1042,12 @@ if(WITH_OPTIM) endif() if(WITH_SSE2) check_sse2_intrinsics() - # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true - if("${ARCH}" MATCHES "i[3-6]86") - cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF) - endif() if(HAVE_SSE2_INTRIN) add_definitions(-DX86_SSE2) set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/chorba_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) if(NOT ${ARCH} MATCHES "x86_64") set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") - add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable") - if(FORCE_SSE2) - add_definitions(-DX86_NOCHECK_SSE2) - endif() endif() else() set(WITH_SSE2 OFF) diff --git a/README.md b/README.md index 183935e3..c0ddc622 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,6 @@ Advanced Build Options | CMake | configure | Description | Default | |:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------| -| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) | | WITH_AVX2 | | Build with AVX2 intrinsics | ON | | WITH_AVX512 | | Build with AVX512 intrinsics | ON | | WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index ddb61b74..918b7e0f 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -91,7 +91,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); #ifdef DISABLE_RUNTIME_CPU_DETECTION // X86 - SSE2 -# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2) +# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_sse2 # undef native_inflate_fast diff --git a/configure b/configure index 0a7cbfcf..fcfc795d 100755 --- a/configure +++ b/configure @@ -106,7 +106,6 @@ builddfltccinflate=0 buildcrc32vx=1 buildcrc32la=1 floatabi= -forcesse2=0 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal # instruction scheduling unless you specify a reasonable -mtune= target avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2" @@ -190,7 +189,6 @@ case "$1" in echo ' [--with-dfltcc-inflate] Use DEFLATE CONVERSION CALL instruction for decompression on IBM Z' | tee -a configure.log echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log echo ' [--with-reduced-mem] Reduced memory usage for special cases (reduces performance)' | tee -a configure.log - echo ' [--force-sse2] Assume SSE2 instructions are always available (disabled by default on x86, enabled on x86_64)' | tee -a configure.log exit 0 ;; -p*=* | --prefix=*) prefix=$(echo $1 | sed 's/.*=//'); shift ;; -e*=* | --eprefix=*) exec_prefix=$(echo $1 | sed 's/.*=//'); shift ;; @@ -229,7 +227,6 @@ case "$1" in --without-crc32-vx) buildcrc32vx=0; shift ;; --without-crc32-la) buildcrc32la=0; shift ;; --with-reduced-mem) reducedmem=1; shift ;; - --force-sse2) forcesse2=1; shift ;; -a*=* | --archs=*) ARCHS=$(echo $1 | sed 's/.*=//'); shift ;; --sysconfdir=*) echo "ignored option: --sysconfdir" | tee -a configure.log; shift ;; --localstatedir=*) echo "ignored option: --localstatedir" | tee -a configure.log; shift ;; @@ -1811,11 +1808,6 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -DX86_SSE2" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse2.o chorba_sse2.o compare256_sse2.o slide_hash_sse2.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse2.lo chorba_sse2.lo compare256_sse2.lo slide_hash_sse2.lo" - - if test $forcesse2 -eq 1; then - CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2" - SFLAGS="${SFLAGS} -DX86_NOCHECK_SSE2" - fi fi check_ssse3_intrinsics diff --git a/functable.c b/functable.c index 831a8a27..1f8f52fd 100644 --- a/functable.c +++ b/functable.c @@ -68,7 +68,7 @@ static void init_functable(void) { // X86 - SSE2 #ifdef X86_SSE2 -# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) +# if !defined(__x86_64__) && !defined(_M_X64) if (cf.x86.has_sse2) # endif { -- 2.47.3