From: Nathan Moinvaziri
Date: Mon, 2 Feb 2026 21:43:09 +0000 (-0800)
Subject: Refactor ctz builtins while always providing a fallback.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d397562b5d3413c596c9eb7645487b937dc8a0e7;p=thirdparty%2Fzlib-ng.git

Refactor ctz builtins while always providing a fallback.

Centralize count trailing zeros logic in fallback_builtins.h with
zng_ctz32/zng_ctz64, which use hardware intrinsics when available and
De Bruijn multiplication as a portable fallback.
---
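For illustration, not part of the patch: the pattern the new helpers centralize is
to XOR two equal-sized chunks, normalize the result to little-endian, and take
ctz(diff) / 8 as the index of the first mismatching byte. A minimal standalone
sketch, with a naive loop standing in for zng_ctz64 and assuming a little-endian
host:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Naive stand-in for zng_ctz64; the helper in the patch uses __builtin_ctzll,
     * MSVC intrinsics, or a De Bruijn table. The input must be non-zero. */
    static uint32_t ctz64_naive(uint64_t v) {
        uint32_t n = 0;
        while ((v & 1) == 0) { v >>= 1; n++; }
        return n;
    }

    int main(void) {
        const uint8_t a[8] = { 'z', 'l', 'i', 'b', '-', 'n', 'g', '2' };
        const uint8_t b[8] = { 'z', 'l', 'i', 'b', '_', 'n', 'g', '2' };
        uint64_t sv, mv;
        memcpy(&sv, a, sizeof(sv));
        memcpy(&mv, b, sizeof(mv));
        uint64_t diff = sv ^ mv;                 /* non-zero: the buffers differ */
        /* Byte 0 sits in the low bits on a little-endian host, so ctz(diff) / 8
         * is the index of the first mismatching byte (prints 4 here). */
        printf("first mismatch at byte %u\n", ctz64_naive(diff) / 8);
        return 0;
    }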
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c
index 3d05152f3..afaf42f5b 100644
--- a/arch/arm/compare256_neon.c
+++ b/arch/arm/compare256_neon.c
@@ -24,16 +24,12 @@ static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t
         cmp = veorq_u8(a, b);
 
         lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
-        if (lane) {
-            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
-            return len + match_byte;
-        }
+        if (lane)
+            return len + zng_ctz64(lane) / 8;
         len += 8;
         lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
-        if (lane) {
-            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
-            return len + match_byte;
-        }
+        if (lane)
+            return len + zng_ctz64(lane) / 8;
         len += 8;
 
         src0 += 16, src1 += 16;
diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h
index ac934841d..331a14bfc 100644
--- a/arch/generic/compare256_p.h
+++ b/arch/generic/compare256_p.h
@@ -78,14 +78,8 @@ static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
         mv = zng_memread_4(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-# else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-# endif
-            return len + match_byte;
-        }
+        if (diff)
+            return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
 
         src0 += 4, src1 += 4, len += 4;
     } while (len < 256);
@@ -106,14 +100,8 @@ static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
         mv = zng_memread_8(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-# else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-# endif
-            return len + (uint32_t)match_byte;
-        }
+        if (diff)
+            return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
 
         src0 += 8, src1 += 8, len += 8;
     } while (len < 256);
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
index 7cc05d993..2db428b6b 100644
--- a/arch/loongarch/compare256_lasx.c
+++ b/arch/loongarch/compare256_lasx.c
@@ -23,10 +23,8 @@ static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t
         ymm_src1 = __lasx_xvld(src1, 0);
         ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
         unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
 
         src0 += 32, src1 += 32, len += 32;
 
@@ -34,10 +32,8 @@ static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t
         ymm_src1 = __lasx_xvld(src1, 0);
         ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
         mask = (unsigned)lasx_movemask_b(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask);
 
         src0 += 32, src1 += 32, len += 32;
     } while (len < 256);
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
index 4d23dee3c..e02329db0 100644
--- a/arch/loongarch/compare256_lsx.c
+++ b/arch/loongarch/compare256_lsx.c
@@ -27,10 +27,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
     /* Compiler _may_ turn this branch into a ptest + movemask,
      * since a lot of those uops are shared and fused */
-    if (mask != 0xFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return match_byte;
-    }
+    if (mask != 0xFFFF)
+        return zng_ctz32(~mask);
 
     const uint8_t *last0 = src0 + 240;
     const uint8_t *last1 = src1 + 240;
@@ -51,10 +49,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
         /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return len + zng_ctz32(~mask);
 
         len += 16, src0 += 16, src1 += 16;
     }
@@ -66,10 +62,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
         mask = (unsigned)lsx_movemask_b(xmm_cmp);
 
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return 240 + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return 240 + zng_ctz32(~mask);
     }
 
     return 256;
diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c
index 8a0213c3a..c99db3b34 100644
--- a/arch/x86/compare256_avx2.c
+++ b/arch/x86/compare256_avx2.c
@@ -24,10 +24,8 @@ static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t
         ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
         ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
         unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
 
         src0 += 32, src1 += 32, len += 32;
 
@@ -35,10 +33,8 @@ static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t
         ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
         ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
         mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask);
 
         src0 += 32, src1 += 32, len += 32;
     } while (len < 256);
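The SIMD variants above all follow the same shape: a byte-equality compare yields a
mask with one set bit per identical byte, so inverting the mask turns the first
mismatch into the lowest set bit and a trailing-zero count recovers its index. A
scalar illustration of that inversion, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t a[16] = "match_here_01234";
        const uint8_t b[16] = "match_herX_01234";
        uint32_t mask = 0;
        /* Emulate movemask: set bit i when byte i is identical in both buffers. */
        for (int i = 0; i < 16; i++)
            mask |= (uint32_t)(a[i] == b[i]) << i;
        if (mask != 0xFFFF) {
            /* Invert so identical bytes become 0; the lowest set bit marks the mismatch. */
            uint32_t inv = ~mask & 0xFFFF, idx = 0;
            while ((inv & 1) == 0) { inv >>= 1; idx++; }
            printf("first mismatch at byte %u\n", idx);   /* prints 9 */
        }
        return 0;
    }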
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
index a1ebe0e5b..f61402ae6 100644
--- a/arch/x86/compare256_avx512.c
+++ b/arch/x86/compare256_avx512.c
@@ -34,47 +34,36 @@ static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8
     xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
     xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
     mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
-    if (mask_0 != 0x0000FFFF) {
-        // There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
-        return match_byte;
-    }
+    if (mask_0 != 0x0000FFFF)
+        return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
 
     // 64 bytes
     zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
     zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
     mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
-    if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
-        return 16 + match_byte;
-    }
+    if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+        return 16 + zng_ctz64(~mask_1);
 
     // 64 bytes
     zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
     zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
     mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
-    if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
-        return 80 + match_byte;
-    }
+    if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+        return 80 + zng_ctz64(~mask_2);
 
     // 64 bytes
     zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
     zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
     mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
-    if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
-        return 144 + match_byte;
-    }
+    if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+        return 144 + zng_ctz64(~mask_3);
 
     // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
     zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
     zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
     mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
-    if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
-        return 192 + match_byte;
-    }
+    if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+        return 192 + zng_ctz64(~mask_4);
 
     return 256;
 }
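A quick way to see why the last load above can start at offset 192: the five loads
(16 bytes at 0, then 64 bytes at 16, 80, 144 and 192) cover all 256 bytes, and the
only effect of the overlap is that bytes 192..207 are read a second time after they
are already known to match. A throwaway check, not part of the patch:

    #include <stdio.h>

    int main(void) {
        /* Offsets and widths of the five loads in compare256_avx512_static. */
        const int off[5] = { 0, 16, 80, 144, 192 };
        const int len[5] = { 16, 64, 64, 64, 64 };
        int covered[256] = { 0 };
        for (int i = 0; i < 5; i++)
            for (int j = off[i]; j < off[i] + len[i]; j++)
                covered[j]++;
        int holes = 0, twice = 0;
        for (int j = 0; j < 256; j++) {
            holes += (covered[j] == 0);
            twice += (covered[j] > 1);
        }
        printf("holes=%d twice=%d\n", holes, twice);   /* prints holes=0 twice=16 */
        return 0;
    }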
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 1d539cb0d..2864b4df9 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -25,10 +25,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
     /* Compiler _may_ turn this branch into a ptest + movemask,
      * since a lot of those uops are shared and fused */
-    if (mask != 0xFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return match_byte;
-    }
+    if (mask != 0xFFFF)
+        return zng_ctz32(~mask);
 
     const uint8_t *last0 = src0 + 240;
     const uint8_t *last1 = src1 + 240;
@@ -49,10 +47,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
         /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return len + zng_ctz32(~mask);
 
         len += 16, src0 += 16, src1 += 16;
     }
@@ -64,10 +60,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
         mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
 
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return 240 + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return 240 + zng_ctz32(~mask);
     }
 
     return 256;
diff --git a/compare256_rle.h b/compare256_rle.h
index 5edfd734a..8fac7e108 100644
--- a/compare256_rle.h
+++ b/compare256_rle.h
@@ -83,14 +83,8 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
         mv = zng_memread_4(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
-            return len + match_byte;
-        }
+        if (diff)
+            return len + zng_ctz32(Z_U32_TO_LE(diff)) / 8;
 
         src1 += 4, len += 4;
     } while (len < 256);
@@ -116,14 +110,8 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
         mv = zng_memread_8(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
-            return len + (uint32_t)match_byte;
-        }
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
 
         src1 += 8, len += 8;
     } while (len < 256);
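The fallback_builtins.h change below prefers the compiler builtin, then the MSVC
intrinsics, and only then the De Bruijn multiply-and-lookup. The fallback is easy to
sanity-check against a naive bit loop; an illustrative standalone sketch, not part of
the patch, reusing the 32-bit table and constant from the diff:

    #include <assert.h>
    #include <stdint.h>

    /* De Bruijn CTZ: isolate the lowest set bit, multiply by a De Bruijn constant,
     * and the top 5 bits of the product index a 32-entry lookup table. */
    static uint32_t ctz32_debruijn(uint32_t value) {
        static const uint8_t tbl[32] = {
            0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
            31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
        };
        uint32_t lsb = value & (~value + 1u);   /* lowest set bit; value must be non-zero */
        return tbl[(lsb * 0x077CB531U) >> 27];
    }

    static uint32_t ctz32_naive(uint32_t value) {
        uint32_t n = 0;
        while ((value & 1) == 0) { value >>= 1; n++; }
        return n;
    }

    int main(void) {
        /* Every single-bit input must map to its bit index... */
        for (uint32_t i = 0; i < 32; i++)
            assert(ctz32_debruijn(1u << i) == i);
        /* ...and arbitrary non-zero inputs must agree with the naive count. */
        for (uint32_t v = 1; v < 1000000; v++)
            assert(ctz32_debruijn(v) == ctz32_naive(v));
        return 0;
    }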
diff --git a/fallback_builtins.h b/fallback_builtins.h
index 8ac842106..8ccd04e04 100644
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -1,51 +1,71 @@
 #ifndef FALLBACK_BUILTINS_H
 #define FALLBACK_BUILTINS_H
 
-/* Provide fallback for compilers that don't support __has_builtin */
-# ifndef __has_builtin
-# define __has_builtin(x) 0
-# endif
-
 #if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+#endif
 
-#include <intrin.h>
+/* Provide fallback for compilers that don't support __has_builtin */
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/* Count trailing zeros (CTZ) functions with portable fallback.
+ *
+ * Predicate: Input must be non-zero. The result is undefined for zero input because
+ * __builtin_ctz, BSF, and TZCNT all have undefined/different behavior for zero. TZCNT
+ * returns operand size for zero, BSF leaves destination undefined, and __builtin_ctz
+ * is explicitly undefined per GCC/Clang docs. */
 
-/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- * Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
- * If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
- * Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
- */
-Z_FORCEINLINE static int __builtin_ctz(unsigned int value) {
+Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) {
     Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctz)
+    return (uint32_t)__builtin_ctz(value);
+#elif defined(_MSC_VER) && !defined(__clang__)
 # if defined(X86_FEATURES) && !(_MSC_VER < 1700)
-    return (int)_tzcnt_u32(value);
+    /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */
+    return (uint32_t)_tzcnt_u32(value);
 # else
     unsigned long trailing_zero;
     _BitScanForward(&trailing_zero, value);
-    return (int)trailing_zero;
+    return (uint32_t)trailing_zero;
 # endif
+#else
+    /* De Bruijn CTZ for 32-bit values */
+    static const uint8_t debruijn_ctz32[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+    uint32_t lsb = value & (~value + 1u);
+    return debruijn_ctz32[(lsb * 0x077CB531U) >> 27];
+#endif
 }
-# define HAVE_BUILTIN_CTZ
 
-# ifdef ARCH_64BIT
-/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- */
-Z_FORCEINLINE static int __builtin_ctzll(unsigned long long value) {
+Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) {
     Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctzll)
+    return (uint32_t)__builtin_ctzll(value);
+#elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT)
 # if defined(X86_FEATURES) && !(_MSC_VER < 1700)
-    return (int)_tzcnt_u64(value);
+    /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */
+    return (uint32_t)_tzcnt_u64(value);
 # else
     unsigned long trailing_zero;
     _BitScanForward64(&trailing_zero, value);
-    return (int)trailing_zero;
+    return (uint32_t)trailing_zero;
 # endif
+#else
+    /* De Bruijn CTZ for 64-bit values */
+    static const uint8_t debruijn_ctz64[64] = {
+        63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27,
+        61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10,
+        62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9,
+        50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11
+    };
+    uint64_t lsb = value & (~value + 1ull);
+    return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58];
+#endif
 }
-# define HAVE_BUILTIN_CTZLL
-# endif // ARCH_64BIT
-
-#endif // _MSC_VER && !__clang__
 
 #if !__has_builtin(__builtin_bitreverse16)