From: Nathan Moinvaziri Date: Tue, 10 Mar 2026 06:41:45 +0000 (-0700) Subject: Add fallback defines to skip generic C code when native intrinsics exist X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=15c7569f1b38c5bca1f4ea9fee4e2b57482c9290;p=thirdparty%2Fzlib-ng.git Add fallback defines to skip generic C code when native intrinsics exist Each arch header now sets *_FALLBACK defines (ADLER32_FALLBACK, CHUNKSET_FALLBACK, COMPARE256_FALLBACK, CRC32_BRAID_FALLBACK, SLIDE_HASH_FALLBACK) when no native SIMD implementation exists. Generic C source files, declarations, functable entries, tests, and benchmarks are guarded by these defines. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 81d7b0729..ffd3c5bac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1297,7 +1297,7 @@ set(ZLIB_SRCS zutil.c ) -set(ZLIB_ALL_FALLBACK_SRCS +set(ZLIB_GENERIC_SRCS arch/generic/adler32_c.c arch/generic/chunkset_c.c arch/generic/compare256_c.c @@ -1306,16 +1306,6 @@ set(ZLIB_ALL_FALLBACK_SRCS ) if(WITH_ALL_FALLBACKS) - list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS}) - add_definitions(-DWITH_ALL_FALLBACKS) -elseif(BASEARCH_X86_FOUND AND ARCH_64BIT AND WITH_SSE2) - # x86_64 always has SSE2, so let the SSE2 functions act as fallbacks. 
- list(APPEND ZLIB_GENERIC_SRCS - arch/generic/adler32_c.c - arch/generic/crc32_braid_c.c - ) -else() - list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS}) add_definitions(-DWITH_ALL_FALLBACKS) endif() diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h index bc77adb97..d5ebd1997 100644 --- a/arch/arm/arm_functions.h +++ b/arch/arm/arm_functions.h @@ -18,15 +18,29 @@ uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match); void slide_hash_neon(deflate_state *s); #endif +#ifndef ARM_NEON_NATIVE +# define ADLER32_FALLBACK +# define CHUNKSET_FALLBACK +# define COMPARE256_FALLBACK +# ifndef ARM_SIMD_NATIVE +# define SLIDE_HASH_FALLBACK +# endif +#endif + #ifdef ARM_CRC32 uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); #endif + #ifdef ARM_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); #endif +#if !defined(ARM_CRC32_NATIVE) && !defined(ARM_PMULL_EOR3_NATIVE) +# define CRC32_BRAID_FALLBACK +#endif + #ifdef ARM_SIMD void slide_hash_armv6(deflate_state *s); #endif diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c index 84c946f45..8abfcd2e6 100644 --- a/arch/generic/adler32_c.c +++ b/arch/generic/adler32_c.c @@ -4,6 +4,10 @@ */ #include "zbuild.h" +#include "arch_functions.h" + +#ifdef ADLER32_FALLBACK + #include "functable.h" #include "adler32_p.h" @@ -53,3 +57,5 @@ Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t * memcpy(dst, src, len); return adler; } + +#endif /* ADLER32_FALLBACK */ diff --git a/arch/generic/chunkset_c.c b/arch/generic/chunkset_c.c index ff9b1cb5f..5cc66977e 100644 --- a/arch/generic/chunkset_c.c +++ b/arch/generic/chunkset_c.c @@ -3,6 +3,10 @@ */ #include "zbuild.h" +#include "arch_functions.h" + +#ifdef CHUNKSET_FALLBACK + 
#include "zmemory.h" typedef uint64_t chunk_t; @@ -38,3 +42,5 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { #define INFLATE_FAST inflate_fast_c #include "inffast_tpl.h" + +#endif /* CHUNKSET_FALLBACK */ diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c index a2b47751e..e0b016556 100644 --- a/arch/generic/compare256_c.c +++ b/arch/generic/compare256_c.c @@ -4,6 +4,10 @@ */ #include "zbuild.h" +#include "arch_functions.h" + +#ifdef COMPARE256_FALLBACK + #include "zendian.h" #include "deflate.h" #include "fallback_builtins.h" @@ -64,7 +68,6 @@ static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t * # define COMPARE256 compare256_64_static #endif -#ifdef WITH_ALL_FALLBACKS Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) { return compare256_8_static(src0, src1); } @@ -72,7 +75,6 @@ Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) { Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) { return compare256_64_static(src0, src1); } -#endif Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) { return COMPARE256(src0, src1); @@ -86,3 +88,5 @@ Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) { #define LONGEST_MATCH_SLOW #define LONGEST_MATCH longest_match_slow_c #include "match_tpl.h" + +#endif /* COMPARE256_FALLBACK */ diff --git a/arch/generic/crc32_braid_c.c b/arch/generic/crc32_braid_c.c index bda4a249b..1e83543d1 100644 --- a/arch/generic/crc32_braid_c.c +++ b/arch/generic/crc32_braid_c.c @@ -8,6 +8,11 @@ */ #include "zbuild.h" +#include "arch_functions.h" + +/* Used by chorba fallback and by arch-specific implementations (s390 vx, risc-v zbc). 
*/ +#ifdef CRC32_BRAID_FALLBACK + #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" #include "crc32_p.h" @@ -211,3 +216,5 @@ Z_INTERNAL uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t * memcpy(dst, src, len); return crc; } + +#endif /* CRC32_BRAID_FALLBACK */ diff --git a/arch/generic/crc32_chorba_c.c b/arch/generic/crc32_chorba_c.c index 9f8427bd1..ded968a8a 100644 --- a/arch/generic/crc32_chorba_c.c +++ b/arch/generic/crc32_chorba_c.c @@ -1,5 +1,8 @@ #include "zbuild.h" -#include "zendian.h" +#include "arch_functions.h" + +#ifdef CRC32_CHORBA_FALLBACK + #if defined(__EMSCRIPTEN__) # include "zutil_p.h" #endif @@ -7,7 +10,6 @@ #include "crc32_chorba_p.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" -#include "generic_functions.h" /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */ #define bitbuffer_size_bytes (16 * 1024 * sizeof(chorba_word_t)) @@ -1032,3 +1034,5 @@ uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_ memcpy(dst, src, len); return crc; } + +#endif /* CRC32_CHORBA_FALLBACK */ diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h index c150a2f01..0fcca560b 100644 --- a/arch/generic/generic_functions.h +++ b/arch/generic/generic_functions.h @@ -5,9 +5,6 @@ #ifndef GENERIC_FUNCTIONS_H_ #define GENERIC_FUNCTIONS_H_ -#include "zendian.h" -#include "deflate.h" - typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len); typedef uint32_t (*adler32_copy_func)(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1); @@ -15,50 +12,69 @@ typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len); typedef uint32_t (*crc32_copy_func)(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); typedef void (*slide_hash_func)(deflate_state *s); - +#ifdef ADLER32_FALLBACK uint32_t adler32_c(uint32_t adler, const uint8_t *buf, 
size_t len); uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); - +#endif +#ifdef CHUNKSET_FALLBACK uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left); - -#ifdef WITH_ALL_FALLBACKS +#endif +#ifdef COMPARE256_FALLBACK uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1); -#endif uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); +#endif +#ifdef CRC32_BRAID_FALLBACK uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif -#ifndef WITHOUT_CHORBA +/* Chorba is available whenever braid is needed as a fallback and hasn't been disabled. */ +#if defined(CRC32_BRAID_FALLBACK) && !defined(WITHOUT_CHORBA) +# define CRC32_CHORBA_FALLBACK +#endif + +#ifdef CRC32_CHORBA_FALLBACK uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); #endif - +#ifdef CHUNKSET_FALLBACK void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); - +#endif +#ifdef COMPARE256_FALLBACK uint32_t longest_match_c(deflate_state *const s, uint32_t cur_match); uint32_t longest_match_slow_c(deflate_state *const s, uint32_t cur_match); - +#endif +#ifdef SLIDE_HASH_FALLBACK void slide_hash_c(deflate_state *s); +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION -// Generic code -# define native_adler32 adler32_c -# define native_adler32_copy adler32_copy_c -# define native_chunkmemset_safe chunkmemset_safe_c -#ifndef WITHOUT_CHORBA -# define native_crc32 crc32_chorba -# define native_crc32_copy crc32_copy_chorba -#else -# define native_crc32 crc32_braid -# define native_crc32_copy crc32_copy_braid -#endif -# define native_inflate_fast inflate_fast_c -# define native_slide_hash slide_hash_c -# define native_longest_match longest_match_c -# define 
native_longest_match_slow longest_match_slow_c -# define native_compare256 compare256_c +// Generic fallbacks when no native implementation exists +# ifdef ADLER32_FALLBACK +# define native_adler32 adler32_c +# define native_adler32_copy adler32_copy_c +# endif +# ifdef CHUNKSET_FALLBACK +# define native_chunkmemset_safe chunkmemset_safe_c +# define native_inflate_fast inflate_fast_c +# endif +# ifdef COMPARE256_FALLBACK +# define native_compare256 compare256_c +# define native_longest_match longest_match_c +# define native_longest_match_slow longest_match_slow_c +# endif +# ifdef CRC32_CHORBA_FALLBACK +# define native_crc32 crc32_chorba +# define native_crc32_copy crc32_copy_chorba +# elif defined(CRC32_BRAID_FALLBACK) +# define native_crc32 crc32_braid +# define native_crc32_copy crc32_copy_braid +# endif +# ifdef SLIDE_HASH_FALLBACK +# define native_slide_hash slide_hash_c +# endif #endif #endif diff --git a/arch/generic/slide_hash_c.c b/arch/generic/slide_hash_c.c index 8345b9e36..8fdc478cb 100644 --- a/arch/generic/slide_hash_c.c +++ b/arch/generic/slide_hash_c.c @@ -5,6 +5,10 @@ */ #include "zbuild.h" +#include "arch_functions.h" + +#ifdef SLIDE_HASH_FALLBACK + #include "deflate.h" /* =========================================================================== @@ -50,3 +54,5 @@ Z_INTERNAL void slide_hash_c(deflate_state *s) { slide_hash_c_chain(s->head, HASH_SIZE, wsize); slide_hash_c_chain(s->prev, wsize, wsize); } + +#endif /* SLIDE_HASH_FALLBACK */ diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h index 0ec8bd66d..980ebca02 100644 --- a/arch/loongarch/loongarch_functions.h +++ b/arch/loongarch/loongarch_functions.h @@ -15,6 +15,10 @@ uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); #endif +#ifndef LOONGARCH_CRC_NATIVE +# define CRC32_BRAID_FALLBACK +#endif + #ifdef LOONGARCH_LSX uint32_t 
adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); @@ -26,6 +30,13 @@ uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match); void slide_hash_lsx(deflate_state *s); #endif +#ifndef LOONGARCH_LSX_NATIVE +# define ADLER32_FALLBACK +# define CHUNKSET_FALLBACK +# define COMPARE256_FALLBACK +# define SLIDE_HASH_FALLBACK +#endif + #ifdef LOONGARCH_LASX uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len); uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/arch/power/power_functions.h b/arch/power/power_functions.h index ccc7754a4..78bae4a49 100644 --- a/arch/power/power_functions.h +++ b/arch/power/power_functions.h @@ -25,12 +25,28 @@ void slide_hash_power8(deflate_state *s); void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); #endif +#if !defined(PPC_VMX_NATIVE) && !defined(POWER8_VSX_NATIVE) +# define ADLER32_FALLBACK +# define SLIDE_HASH_FALLBACK +#endif + +#ifndef POWER8_VSX_NATIVE +# define CHUNKSET_FALLBACK +#endif +#ifndef POWER8_VSX_CRC32_NATIVE +# define CRC32_BRAID_FALLBACK +#endif + #ifdef POWER9 uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match); uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match); #endif +#ifndef POWER9_NATIVE +# define COMPARE256_FALLBACK +#endif + #ifdef DISABLE_RUNTIME_CPU_DETECTION // Power - VMX # ifdef PPC_VMX_NATIVE diff --git a/arch/riscv/riscv_functions.h b/arch/riscv/riscv_functions.h index 89120ffab..22f783c5e 100644 --- a/arch/riscv/riscv_functions.h +++ b/arch/riscv/riscv_functions.h @@ -11,6 +11,8 @@ #include "riscv_natives.h" +#define CRC32_BRAID_FALLBACK /* used by crc32_zbc */ + #ifdef RISCV_RVV uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_rvv(uint32_t adler, 
uint8_t *dst, const uint8_t *src, size_t len); @@ -23,6 +25,13 @@ void slide_hash_rvv(deflate_state *s); void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); #endif +#ifndef RISCV_RVV_NATIVE +# define ADLER32_FALLBACK +# define CHUNKSET_FALLBACK +# define COMPARE256_FALLBACK +# define SLIDE_HASH_FALLBACK +#endif + #ifdef RISCV_CRC32_ZBC uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/arch/s390/s390_functions.h b/arch/s390/s390_functions.h index d5308c8d0..c0043a6e8 100644 --- a/arch/s390/s390_functions.h +++ b/arch/s390/s390_functions.h @@ -7,6 +7,15 @@ #include "s390_natives.h" +#define ADLER32_FALLBACK +#define CHUNKSET_FALLBACK +#define COMPARE256_FALLBACK +#define CRC32_BRAID_FALLBACK /* used by crc32_s390_vx */ + +#ifndef S390_VX_NATIVE +# define SLIDE_HASH_FALLBACK +#endif + #ifdef S390_VX uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/arch/x86/crc32_chorba_sse2.c b/arch/x86/crc32_chorba_sse2.c index 93ec5d535..5fbdbc6ef 100644 --- a/arch/x86/crc32_chorba_sse2.c +++ b/arch/x86/crc32_chorba_sse2.c @@ -1,12 +1,13 @@ -#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE) - #include "zbuild.h" +#include "arch_functions.h" + +#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE) && defined(CRC32_CHORBA_FALLBACK) + #include "crc32_chorba_p.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" #include <emmintrin.h> #include "arch/x86/x86_intrins.h" -#include "arch_functions.h" #define LSHIFT_QWORD(x) _mm_unpacklo_epi64(_mm_setzero_si128(), (x)) #define RSHIFT_QWORD(x) _mm_unpackhi_epi64((x), _mm_setzero_si128()) diff --git a/arch/x86/crc32_chorba_sse41.c b/arch/x86/crc32_chorba_sse41.c index a137c7a56..d8cadc93a 100644 --- a/arch/x86/crc32_chorba_sse41.c +++ b/arch/x86/crc32_chorba_sse41.c @@ -1,13 +1,14 @@ -#if 
defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) - #include "zbuild.h" +#include "arch_functions.h" + +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) && defined(CRC32_CHORBA_FALLBACK) + #include "crc32_chorba_p.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" #include <emmintrin.h> #include <smmintrin.h> #include "arch/x86/x86_intrins.h" -#include "arch_functions.h" #define READ_NEXT(in, off, a, b) \ do { \ diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index 881c6efe2..0bcbdaeda 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -24,13 +24,19 @@ uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match); uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match); void slide_hash_sse2(deflate_state *s); -# if !defined(WITHOUT_CHORBA_SSE) +# if !defined(WITHOUT_CHORBA) && !defined(WITHOUT_CHORBA_SSE) uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len); # endif #endif +#ifndef X86_SSE2_NATIVE +# define CHUNKSET_FALLBACK +# define COMPARE256_FALLBACK +# define SLIDE_HASH_FALLBACK +#endif + #ifdef X86_SSSE3 uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); @@ -38,9 +44,15 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); #endif -#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) +#ifndef X86_SSSE3_NATIVE +# define ADLER32_FALLBACK +#endif + +#if defined(X86_SSE41) +# if !defined(WITHOUT_CHORBA) && !defined(WITHOUT_CHORBA_SSE) uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +# 
endif #endif #ifdef X86_SSE42 @@ -84,6 +96,10 @@ uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); #endif +#if !defined(X86_PCLMULQDQ_NATIVE) && !defined(X86_VPCLMULQDQ_NATIVE) +# define CRC32_BRAID_FALLBACK +#endif + #ifdef DISABLE_RUNTIME_CPU_DETECTION // X86 - SSE2 # ifdef X86_SSE2_NATIVE @@ -97,7 +113,7 @@ uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t # define native_longest_match longest_match_sse2 # undef native_longest_match_slow # define native_longest_match_slow longest_match_slow_sse2 -# if !defined(WITHOUT_CHORBA_SSE) +# if !defined(WITHOUT_CHORBA) && !defined(WITHOUT_CHORBA_SSE) # undef native_crc32 # define native_crc32 crc32_chorba_sse2 # undef native_crc32_copy @@ -118,11 +134,13 @@ uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t # define native_inflate_fast inflate_fast_ssse3 # endif // X86 - SSE4.1 -# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE) -# undef native_crc32 -# define native_crc32 crc32_chorba_sse41 -# undef native_crc32_copy -# define native_crc32_copy crc32_copy_chorba_sse41 +# if defined(X86_SSE41_NATIVE) +# if !defined(WITHOUT_CHORBA) && !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse41 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_chorba_sse41 +# endif # endif // X86 - SSE4.2 # ifdef X86_SSE42_NATIVE diff --git a/arch_functions.h b/arch_functions.h index 979c96862..d5b152e73 100644 --- a/arch_functions.h +++ b/arch_functions.h @@ -11,8 +11,6 @@ #include "deflate.h" #include "fallback_builtins.h" -#include "arch/generic/generic_functions.h" - #if defined(X86_FEATURES) # include "arch/x86/x86_functions.h" #elif defined(ARM_FEATURES) @@ -25,6 +23,34 @@ # include "arch/riscv/riscv_functions.h" #elif defined(LOONGARCH_FEATURES) # include 
"arch/loongarch/loongarch_functions.h" +#else +/* No architecture detected - all fallbacks needed */ +# ifndef WITH_ALL_FALLBACKS +# define WITH_ALL_FALLBACKS +# endif +#endif + +#ifdef WITH_ALL_FALLBACKS +# ifndef ADLER32_FALLBACK +# define ADLER32_FALLBACK +# endif +# ifndef CHUNKSET_FALLBACK +# define CHUNKSET_FALLBACK +# endif +# ifndef COMPARE256_FALLBACK +# define COMPARE256_FALLBACK +# endif +# ifndef CRC32_BRAID_FALLBACK +# define CRC32_BRAID_FALLBACK +# endif +# if !defined(CRC32_CHORBA_FALLBACK) && !defined(WITHOUT_CHORBA) +# define CRC32_CHORBA_FALLBACK +# endif +# ifndef SLIDE_HASH_FALLBACK +# define SLIDE_HASH_FALLBACK +# endif #endif +#include "arch/generic/generic_functions.h" + #endif diff --git a/functable.c b/functable.c index fad863ecb..4064c6942 100644 --- a/functable.c +++ b/functable.c @@ -75,60 +75,25 @@ static int init_functable(void) { cpu_check_features(&cf); ft.force_init = &force_init_empty; - // Set up generic C code fallbacks -#ifndef WITH_ALL_FALLBACKS // Only use necessary generic functions when no suitable simd versions are available. 
-# ifdef X86_SSE2_NATIVE - // x86_64 always has SSE2 - ft.adler32 = &adler32_c; - ft.adler32_copy = &adler32_copy_c; - ft.crc32 = &crc32_braid; - ft.crc32_copy = &crc32_copy_braid; -# elif defined(ARM_NEON_NATIVE) -# ifndef ARM_CRC32_NATIVE - ft.crc32 = &crc32_braid; - ft.crc32_copy = &crc32_copy_braid; -# endif -# elif defined(POWER8_VSX_NATIVE) -# ifndef POWER9_NATIVE - ft.compare256 = &compare256_c; - ft.longest_match = &longest_match_c; - ft.longest_match_slow = &longest_match_slow_c; -# endif -# ifndef POWER8_VSX_CRC32_NATIVE - ft.crc32 = &crc32_braid; - ft.crc32_copy = &crc32_copy_braid; -# endif -# elif defined(LOONGARCH_LSX_NATIVE) -# ifndef LOONGARCH_CRC - ft.crc32 = &crc32_braid; - ft.crc32_copy = &crc32_copy_braid; -# endif -# elif defined(RISCV_RVV_NATIVE) -# ifndef RISCV_ZBC_NATIVE - ft.crc32 = &crc32_braid; - ft.crc32_copy = &crc32_copy_braid; -# endif -# elif defined(S390_VX_NATIVE) +#ifdef ADLER32_FALLBACK ft.adler32 = &adler32_c; ft.adler32_copy = &adler32_copy_c; +#endif +#ifdef CHUNKSET_FALLBACK ft.chunkmemset_safe = &chunkmemset_safe_c; - ft.compare256 = &compare256_c; ft.inflate_fast = &inflate_fast_c; +#endif +#ifdef COMPARE256_FALLBACK + ft.compare256 = &compare256_c; ft.longest_match = &longest_match_c; ft.longest_match_slow = &longest_match_slow_c; - ft.slide_hash = &slide_hash_c; -# endif -#else // WITH_ALL_FALLBACKS - ft.adler32 = &adler32_c; - ft.adler32_copy = &adler32_copy_c; - ft.chunkmemset_safe = &chunkmemset_safe_c; - ft.compare256 = &compare256_c; +#endif +#ifdef CRC32_BRAID_FALLBACK ft.crc32 = &crc32_braid; ft.crc32_copy = &crc32_copy_braid; - ft.inflate_fast = &inflate_fast_c; - ft.longest_match = &longest_match_c; - ft.longest_match_slow = &longest_match_slow_c; +#endif +#ifdef SLIDE_HASH_FALLBACK ft.slide_hash = &slide_hash_c; #endif @@ -136,7 +101,7 @@ static int init_functable(void) { #ifdef WITH_OPTIM // Chorba generic C fallback -#ifndef WITHOUT_CHORBA +#ifdef CRC32_CHORBA_FALLBACK ft.crc32 = &crc32_chorba; ft.crc32_copy = 
&crc32_copy_chorba; #endif diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc index 5ee9102e2..6916af71e 100644 --- a/test/benchmarks/benchmark_adler32.cc +++ b/test/benchmarks/benchmark_adler32.cc @@ -77,7 +77,9 @@ public: BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \ BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag); +#ifdef ADLER32_FALLBACK BENCHMARK_ADLER32(c, adler32_c, 1); +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION BENCHMARK_ADLER32(native, native_adler32, 1); diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc index 6d913b1d1..ff6a1b079 100644 --- a/test/benchmarks/benchmark_adler32_copy.cc +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -128,7 +128,9 @@ public: BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) #endif +#ifdef ADLER32_FALLBACK BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1); +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1); diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index 2d8352879..88929bfb1 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -73,7 +73,7 @@ public: BENCHMARK_COMPARE256(native, native_compare256, 1); #else -#ifdef WITH_ALL_FALLBACKS +#ifdef COMPARE256_FALLBACK BENCHMARK_COMPARE256(8, compare256_8, 1); BENCHMARK_COMPARE256(64, compare256_64, 1); #endif diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc index 772dbfd72..b95f9520a 100644 --- a/test/benchmarks/benchmark_crc32.cc +++ b/test/benchmarks/benchmark_crc32.cc @@ -77,16 +77,18 @@ public: BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \ BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag); +#ifdef CRC32_BRAID_FALLBACK BENCHMARK_CRC32(braid, crc32_braid, 1); +#endif +#ifdef CRC32_CHORBA_FALLBACK +BENCHMARK_CRC32(chorba_c, crc32_chorba, 
1); +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION BENCHMARK_CRC32(native, native_crc32, 1); #else -#ifndef WITHOUT_CHORBA -BENCHMARK_CRC32(chorba_c, crc32_chorba, 1); -#endif -#ifndef WITHOUT_CHORBA_SSE +#if defined(CRC32_CHORBA_FALLBACK) && !defined(WITHOUT_CHORBA_SSE) # ifdef X86_SSE2 BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2); # endif diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc index b0f0704e0..2df1f5710 100644 --- a/test/benchmarks/benchmark_crc32_copy.cc +++ b/test/benchmarks/benchmark_crc32_copy.cc @@ -128,17 +128,19 @@ public: #endif // Base test +#ifdef CRC32_BRAID_FALLBACK BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1); +#endif +#ifdef CRC32_CHORBA_FALLBACK +BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1) +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION // Native BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1) #else // Optimized functions -# ifndef WITHOUT_CHORBA - BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1) -# endif -# ifndef WITHOUT_CHORBA_SSE +# if defined(CRC32_CHORBA_FALLBACK) && !defined(WITHOUT_CHORBA_SSE) # ifdef X86_SSE2 BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2); # endif diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc index 6e8f455cd..34c8fbe3d 100644 --- a/test/benchmarks/benchmark_slidehash.cc +++ b/test/benchmarks/benchmark_slidehash.cc @@ -77,7 +77,7 @@ public: } \ BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(512, MAX_RANDOM_INTS); -#if defined(WITH_ALL_FALLBACKS) || !(defined(__x86_64__) || defined(_M_X64)) +#ifdef SLIDE_HASH_FALLBACK BENCHMARK_SLIDEHASH(c, slide_hash_c, 1); #endif diff --git a/test/test_adler32.cc b/test/test_adler32.cc index c461f9393..7fe8bd69a 100644 --- a/test/test_adler32.cc +++ b/test/test_adler32.cc @@ -36,7 +36,9 @@ 
INSTANTIATE_TEST_SUITE_P(adler32, adler32_variant, testing::ValuesIn(hash_tests) hash(GetParam(), func); \ } +#ifdef ADLER32_FALLBACK TEST_ADLER32(c, adler32_c, 1) +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION TEST_ADLER32(native, native_adler32, 1) diff --git a/test/test_adler32_copy.cc b/test/test_adler32_copy.cc index 725d86aa9..47b63412a 100644 --- a/test/test_adler32_copy.cc +++ b/test/test_adler32_copy.cc @@ -40,7 +40,9 @@ INSTANTIATE_TEST_SUITE_P(adler32_copy, adler32_copy_variant, testing::ValuesIn(h } // Base test +#ifdef ADLER32_FALLBACK TEST_ADLER32_COPY(c, adler32_copy_c, 1) +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION // Native test diff --git a/test/test_compare256.cc b/test/test_compare256.cc index b3efe79fb..9978f9a73 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -63,7 +63,7 @@ static inline void compare256_match_check(compare256_func compare256) { TEST_COMPARE256(native, native_compare256, 1) #else -#ifdef WITH_ALL_FALLBACKS +#ifdef COMPARE256_FALLBACK TEST_COMPARE256(8, compare256_8, 1) TEST_COMPARE256(64, compare256_64, 1) #endif diff --git a/test/test_crc32.cc b/test/test_crc32.cc index 19eb43932..3da7a34a3 100644 --- a/test/test_crc32.cc +++ b/test/test_crc32.cc @@ -77,7 +77,12 @@ INSTANTIATE_TEST_SUITE_P(crc32, crc32_variant, testing::ValuesIn(hash_tests)); hash(func); \ } +#ifdef CRC32_BRAID_FALLBACK TEST_CRC32(braid, crc32_braid, 1) +#endif +#ifdef CRC32_CHORBA_FALLBACK +TEST_CRC32(chorba_c, crc32_chorba, 1) +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION TEST_CRC32(native, native_crc32, 1) @@ -99,9 +104,6 @@ static const int align_offsets[] = { } #endif -#ifndef WITHOUT_CHORBA -TEST_CRC32(chorba_c, crc32_chorba, 1) -#endif #ifdef ARM_CRC32 INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets)); TEST_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32) @@ -129,7 +131,7 @@ TEST_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pc #ifdef X86_VPCLMULQDQ_AVX512 
TEST_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) #endif -#ifndef WITHOUT_CHORBA_SSE +#if defined(CRC32_CHORBA_FALLBACK) && !defined(WITHOUT_CHORBA_SSE) # ifdef X86_SSE2 TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2) # endif diff --git a/test/test_crc32_copy.cc b/test/test_crc32_copy.cc index 12b2be7e8..9edc8f601 100644 --- a/test/test_crc32_copy.cc +++ b/test/test_crc32_copy.cc @@ -40,17 +40,19 @@ INSTANTIATE_TEST_SUITE_P(crc32_copy, crc32_copy_variant, testing::ValuesIn(hash_ } // Base test +#ifdef CRC32_BRAID_FALLBACK TEST_CRC32_COPY(braid, crc32_copy_braid, 1) +#endif +#ifdef CRC32_CHORBA_FALLBACK +TEST_CRC32_COPY(chorba, crc32_copy_chorba, 1) +#endif #ifdef DISABLE_RUNTIME_CPU_DETECTION // Native test TEST_CRC32_COPY(native, native_crc32_copy, 1) #else // Optimized functions -# ifndef WITHOUT_CHORBA - TEST_CRC32_COPY(chorba, crc32_copy_chorba, 1) -# endif -# ifndef WITHOUT_CHORBA_SSE +# if defined(CRC32_CHORBA_FALLBACK) && !defined(WITHOUT_CHORBA_SSE) # ifdef X86_SSE2 TEST_CRC32_COPY(chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2) # endif