From db3f23f720efdcf2ef5a453592a0d73d40cada99 Mon Sep 17 00:00:00 2001
From: Hans Kristian Rosbach
Date: Fri, 10 Oct 2025 14:52:21 +0200
Subject: [PATCH] Don't build C-fallback functions that never get used on x86_64

---
 .github/workflows/pkgcheck.yml         |  4 +--
 CMakeLists.txt                         | 50 +++++++++++++++++++++-----
 README.md                              |  3 +-
 configure                              | 10 ++++++
 functable.c                            | 29 ++++++++++++---
 test/benchmarks/benchmark_slidehash.cc |  2 ++
 6 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/pkgcheck.yml b/.github/workflows/pkgcheck.yml
index 58af3a51..3ec5461f 100644
--- a/.github/workflows/pkgcheck.yml
+++ b/.github/workflows/pkgcheck.yml
@@ -137,7 +137,7 @@ jobs:
         CFLAGS: ${{ matrix.cflags }}
         CXXFLAGS: ${{ matrix.cxxflags }}
         CHOST: ${{ matrix.chost }}
-        CMAKE_ARGS: ${{ matrix.cmake-args }}
+        CMAKE_ARGS: ${{ matrix.cmake-args }} -DWITH_ALL_FALLBACKS=ON
         CONFIGURE_ARGS: ${{ matrix.configure-args }}
         LDFLAGS: ${{ matrix.ldflags }}
 
@@ -147,7 +147,7 @@ jobs:
         CC: ${{ matrix.compiler }}
         CFLAGS: ${{ matrix.cflags }}
         CHOST: ${{ matrix.chost }}
-        CMAKE_ARGS: ${{ matrix.cmake-args }}
+        CMAKE_ARGS: ${{ matrix.cmake-args }} -DWITH_ALL_FALLBACKS=ON
         CONFIGURE_ARGS: ${{ matrix.configure-args }}
         LDFLAGS: ${{ matrix.ldflags }}
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a30ed2f..31e8c691 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,7 @@ endif()
 option(WITH_GZFILEOP "Compile with support for gzFile related functions" ON)
 option(ZLIB_COMPAT "Compile with zlib compatible API" OFF)
 option(WITH_OPTIM "Build with optimisation" ON)
+option(WITH_ALL_FALLBACKS "Build all generic fallback functions (useful for Gbench)" OFF)
 option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF)
 option(WITH_NEW_STRATEGIES "Use new strategies" ON)
 option(WITH_CRC32_CHORBA "Enable optimized CRC32 algorithm Chorba" ON)
@@ -151,6 +152,7 @@ mark_as_advanced(FORCE
     ZLIB_SYMBOL_PREFIX
     WITH_REDUCED_MEM
     WITH_CRC32_CHORBA
+    WITH_ALL_FALLBACKS
     WITH_ARMV8 WITH_NEON
     WITH_ARMV6
     WITH_DFLTCC_DEFLATE
@@ -713,6 +715,7 @@ else()
 endif()
 
 if(WITH_OPTIM)
+    add_definitions(-DWITH_OPTIM)
     if(BASEARCH_ARM_FOUND)
         add_definitions(-DARM_FEATURES)
         if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
@@ -1160,6 +1163,9 @@ if(WITH_OPTIM)
             endif()
         endif()
     endif()
+else()
+    # If WITH_OPTIM is disabled, we need all the fallbacks.
+    set(WITH_ALL_FALLBACKS ON)
 endif()
 
 message(STATUS "Architecture-specific source files: ${ZLIB_ARCH_SRCS}")
@@ -1267,14 +1273,6 @@ set(ZLIB_PRIVATE_HDRS
     zutil_p.h
 )
 set(ZLIB_SRCS
-    arch/generic/adler32_c.c
-    arch/generic/adler32_fold_c.c
-    arch/generic/chunkset_c.c
-    arch/generic/compare256_c.c
-    arch/generic/crc32_braid_c.c
-    arch/generic/crc32_c.c
-    arch/generic/crc32_fold_c.c
-    arch/generic/slide_hash_c.c
     adler32.c
     compress.c
     crc32.c
@@ -1298,6 +1296,39 @@ set(ZLIB_SRCS
     zutil.c
 )
 
+set(ZLIB_ALL_FALLBACK_SRCS
+    arch/generic/adler32_c.c
+    arch/generic/adler32_fold_c.c
+    arch/generic/chunkset_c.c
+    arch/generic/compare256_c.c
+    arch/generic/crc32_braid_c.c
+    arch/generic/crc32_c.c
+    arch/generic/crc32_fold_c.c
+    arch/generic/slide_hash_c.c
+)
+
+if(WITH_ALL_FALLBACKS)
+    list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS})
+    add_definitions(-DWITH_ALL_FALLBACKS)
+elseif(${ARCH} STREQUAL "x86_64" AND WITH_SSE2)
+    # x86_64 always has SSE2, so let the SSE2 functions act as fallbacks.
+    list(APPEND ZLIB_GENERIC_SRCS
+        arch/generic/adler32_c.c
+        arch/generic/adler32_fold_c.c
+        arch/generic/crc32_braid_c.c
+        arch/generic/crc32_c.c
+        arch/generic/crc32_fold_c.c
+    )
+
+    # x86_64 does not need the compare256 fallback if we have BUILTIN_CTZ
+    if(NOT HAVE_BUILTIN_CTZ)
+        list(APPEND ZLIB_GENERIC_SRCS arch/generic/compare256_c.c)
+    endif()
+else()
+    list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS})
+    add_definitions(-DWITH_ALL_FALLBACKS)
+endif()
+
 if(WITH_CRC32_CHORBA)
     list(APPEND ZLIB_SRCS arch/generic/crc32_chorba_c.c)
 endif()
@@ -1316,7 +1347,7 @@ set(ZLIB_GZFILE_SRCS
     gzwrite.c
 )
 
-set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
+set(ZLIB_ALL_SRCS ${ZLIB_GENERIC_SRCS} ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
 if(WITH_GZFILEOP)
     list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS})
 endif()
@@ -1542,6 +1573,7 @@ add_feature_info(WITH_GTEST WITH_GTEST "Build gtest_zlib")
 add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz")
 add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks")
 add_feature_info(WITH_BENCHMARK_APPS WITH_BENCHMARK_APPS "Build application benchmarks")
+add_feature_info(WITH_ALL_FALLBACKS WITH_ALL_FALLBACKS "Build all generic fallback functions")
 add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation")
 add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
 add_feature_info(WITH_CRC32_CHORBA WITH_CRC32_CHORBA "Use optimized CRC32 algorithm Chorba")
diff --git a/README.md b/README.md
index c0ddc622..81d967fb 100644
--- a/README.md
+++ b/README.md
@@ -203,7 +203,7 @@ Advanced Build Options
 | WITH_SSE42 | | Build with SSE42 intrinsics | ON |
 | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
 | WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
-| WITH_ARMV8 | --without-armv8 | Build with ARMv8 intrinsics | ON |
+| WITH_ARMV8 | --without-armv8 | Build with ARMv8 intrinsics | ON |
 | WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
 | WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON |
 | WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
@@ -216,6 +216,7 @@ Advanced Build Options
 | WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
 | INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
 | ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON |
+| WITH_ALL_FALLBACKS | | Build with all C fallbacks (useful for Gbench comparisons) | OFF |
 
 
 Related Projects
diff --git a/configure b/configure
index fcfc795d..9cec2d03 100755
--- a/configure
+++ b/configure
@@ -1774,6 +1774,16 @@ if test $without_new_strategies -eq 1; then
     SFLAGS="${SFLAGS} -DNO_QUICK_STRATEGY -DNO_MEDIUM_STRATEGY"
 fi
 
+# CMake can exclude building some of the generic fallback functions,
+# but configure does not have the detection code to do so.
+CFLAGS="${CFLAGS} -DWITH_ALL_FALLBACKS"
+SFLAGS="${SFLAGS} -DWITH_ALL_FALLBACKS"
+
+if test $without_optimizations -eq 0; then
+    CFLAGS="${CFLAGS} -DWITH_OPTIM"
+    SFLAGS="${SFLAGS} -DWITH_OPTIM"
+fi
+
 ARCHDIR='arch/generic'
 ARCH_STATIC_OBJS=''
 ARCH_SHARED_OBJS=''
diff --git a/functable.c b/functable.c
index 1f8f52fd..4481fdb9 100644
--- a/functable.c
+++ b/functable.c
@@ -47,9 +47,26 @@ static void init_functable(void) {
     struct cpu_features cf;
 
     cpu_check_features(&cf);
-
-    // Generic code
     ft.force_init = &force_init_empty;
+
+    // Set up generic C code fallbacks
+#ifndef WITH_ALL_FALLBACKS
+# if (defined(__x86_64__) || defined(_M_X64)) && defined(X86_SSE2)
+    // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available.
+    ft.adler32 = &adler32_c;
+    ft.adler32_fold_copy = &adler32_fold_copy_c;
+    ft.crc32 = &crc32_c;
+    ft.crc32_fold = &crc32_fold_c;
+    ft.crc32_fold_copy = &crc32_fold_copy_c;
+    ft.crc32_fold_final = &crc32_fold_final_c;
+    ft.crc32_fold_reset = &crc32_fold_reset_c;
+# ifndef HAVE_BUILTIN_CTZ
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+    ft.compare256 = &compare256_c;
+# endif
+# endif
+#else // WITH_ALL_FALLBACKS
     ft.adler32 = &adler32_c;
     ft.adler32_fold_copy = &adler32_fold_copy_c;
     ft.chunkmemset_safe = &chunkmemset_safe_c;
@@ -63,8 +80,10 @@ static void init_functable(void) {
     ft.longest_match = &longest_match_c;
     ft.longest_match_slow = &longest_match_slow_c;
     ft.compare256 = &compare256_c;
+#endif
 
     // Select arch-optimized functions
+#ifdef WITH_OPTIM
 
     // X86 - SSE2
 #ifdef X86_SSE2
@@ -73,9 +92,9 @@ static void init_functable(void) {
 # endif
     {
         ft.chunkmemset_safe = &chunkmemset_safe_sse2;
-#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
+# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
         ft.crc32 = &crc32_chorba_sse2;
-#endif
+# endif
         ft.inflate_fast = &inflate_fast_sse2;
         ft.slide_hash = &slide_hash_sse2;
 # ifdef HAVE_BUILTIN_CTZ
@@ -301,6 +320,8 @@ static void init_functable(void) {
     }
 #endif
 
+#endif // WITH_OPTIM
+
     // Assign function pointers individually for atomic operation
     FUNCTABLE_ASSIGN(ft, force_init);
     FUNCTABLE_ASSIGN(ft, adler32);
diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc
index 6f3b1221..4479a935 100644
--- a/test/benchmarks/benchmark_slidehash.cc
+++ b/test/benchmarks/benchmark_slidehash.cc
@@ -77,7 +77,9 @@ public:
     } \
     BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(512, MAX_RANDOM_INTS);
 
+#if defined(WITH_ALL_FALLBACKS) || !defined(__x86_64__)
 BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+#endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 BENCHMARK_SLIDEHASH(native, native_slide_hash, 1);
-- 
2.47.3
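
Note: the CMakeLists.txt comment above captures the core idea of the patch: SSE2 is part of the x86_64 baseline ISA, so the SSE2 implementations can serve as fallbacks and the generic C functions they supersede no longer need to be compiled. That means Gbench comparisons of C fallbacks against SIMD variants must opt back in via WITH_ALL_FALLBACKS. A minimal usage sketch, assuming an out-of-tree CMake build; the benchmark binary name and path (test/benchmarks/benchmark_zlib) are assumptions, not taken from this patch:

    # Default x86_64 build: C fallbacks superseded by SSE2 are not compiled.
    cmake -B build -DWITH_BENCHMARKS=ON
    cmake --build build

    # Comparison build: compile every generic C fallback as well, so
    # benchmarks like slide_hash_c have something to measure against.
    cmake -B build-fb -DWITH_BENCHMARKS=ON -DWITH_ALL_FALLBACKS=ON
    cmake --build build-fb
    ./build-fb/test/benchmarks/benchmark_zlib --benchmark_filter=slide_hash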