From: Nathan Moinvaziri
Date: Mon, 2 Feb 2026 21:43:09 +0000 (-0800)
Subject: Refactor ctz builtins while always providing a fallback.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d397562b5d3413c596c9eb7645487b937dc8a0e7;p=thirdparty%2Fzlib-ng.git

Refactor ctz builtins while always providing a fallback.

Centralize count trailing zeros logic in fallback_builtins.h with
zng_ctz32/zng_ctz64, which use hardware intrinsics when available and
De Bruijn multiplication as a portable fallback.
---
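For illustration, not part of the patch: the pattern the new helpers centralize is
to XOR two equal-sized chunks, normalize the result to little-endian, and take
ctz(diff) / 8 as the index of the first mismatching byte. A minimal standalone
sketch, with a naive loop standing in for zng_ctz64 and assuming a little-endian
host:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Naive stand-in for zng_ctz64; the helper in the patch uses __builtin_ctzll,
     * MSVC intrinsics, or a De Bruijn table. The input must be non-zero. */
    static uint32_t ctz64_naive(uint64_t v) {
        uint32_t n = 0;
        while ((v & 1) == 0) { v >>= 1; n++; }
        return n;
    }

    int main(void) {
        const uint8_t a[8] = { 'z', 'l', 'i', 'b', '-', 'n', 'g', '2' };
        const uint8_t b[8] = { 'z', 'l', 'i', 'b', '_', 'n', 'g', '2' };
        uint64_t sv, mv;
        memcpy(&sv, a, sizeof(sv));
        memcpy(&mv, b, sizeof(mv));
        uint64_t diff = sv ^ mv;                 /* non-zero: the buffers differ */
        /* Byte 0 sits in the low bits on a little-endian host, so ctz(diff) / 8
         * is the index of the first mismatching byte (prints 4 here). */
        printf("first mismatch at byte %u\n", ctz64_naive(diff) / 8);
        return 0;
    }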
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c
index 3d05152f3..afaf42f5b 100644
--- a/arch/arm/compare256_neon.c
+++ b/arch/arm/compare256_neon.c
@@ -24,16 +24,12 @@ static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t
         cmp = veorq_u8(a, b);
 
         lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
-        if (lane) {
-            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
-            return len + match_byte;
-        }
+        if (lane)
+            return len + zng_ctz64(lane) / 8;
         len += 8;
         lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
-        if (lane) {
-            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
-            return len + match_byte;
-        }
+        if (lane)
+            return len + zng_ctz64(lane) / 8;
         len += 8;
 
         src0 += 16, src1 += 16;
diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h
index ac934841d..331a14bfc 100644
--- a/arch/generic/compare256_p.h
+++ b/arch/generic/compare256_p.h
@@ -78,14 +78,8 @@ static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
         mv = zng_memread_4(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-# else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-# endif
-            return len + match_byte;
-        }
+        if (diff)
+            return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
 
         src0 += 4, src1 += 4, len += 4;
     } while (len < 256);
@@ -106,14 +100,8 @@ static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
         mv = zng_memread_8(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-# else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-# endif
-            return len + (uint32_t)match_byte;
-        }
+        if (diff)
+            return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
 
         src0 += 8, src1 += 8, len += 8;
     } while (len < 256);
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
index 7cc05d993..2db428b6b 100644
--- a/arch/loongarch/compare256_lasx.c
+++ b/arch/loongarch/compare256_lasx.c
@@ -23,10 +23,8 @@ static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t
         ymm_src1 = __lasx_xvld(src1, 0);
         ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
         unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
 
         src0 += 32, src1 += 32, len += 32;
 
@@ -34,10 +32,8 @@ static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t
         ymm_src1 = __lasx_xvld(src1, 0);
         ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
         mask = (unsigned)lasx_movemask_b(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask);
 
         src0 += 32, src1 += 32, len += 32;
     } while (len < 256);
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
index 4d23dee3c..e02329db0 100644
--- a/arch/loongarch/compare256_lsx.c
+++ b/arch/loongarch/compare256_lsx.c
@@ -27,10 +27,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
     /* Compiler _may_ turn this branch into a ptest + movemask,
      * since a lot of those uops are shared and fused */
-    if (mask != 0xFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return match_byte;
-    }
+    if (mask != 0xFFFF)
+        return zng_ctz32(~mask);
 
     const uint8_t *last0 = src0 + 240;
     const uint8_t *last1 = src1 + 240;
@@ -51,10 +49,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
         /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return len + zng_ctz32(~mask);
 
         len += 16, src0 += 16, src1 += 16;
     }
@@ -66,10 +62,8 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
 
         mask = (unsigned)lsx_movemask_b(xmm_cmp);
 
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return 240 + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return 240 + zng_ctz32(~mask);
     }
 
     return 256;
diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c
index 8a0213c3a..c99db3b34 100644
--- a/arch/x86/compare256_avx2.c
+++ b/arch/x86/compare256_avx2.c
@@ -24,10 +24,8 @@ static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t
         ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
         ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
         unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
 
         src0 += 32, src1 += 32, len += 32;
 
@@ -35,10 +33,8 @@ static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t
         ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
         ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
         mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
-        if (mask != 0xFFFFFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFFFFFF)
+            return len + zng_ctz32(~mask);
 
         src0 += 32, src1 += 32, len += 32;
     } while (len < 256);
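The SIMD variants above all follow the same shape: a byte-equality compare yields a
mask with one set bit per identical byte, so inverting the mask turns the first
mismatch into the lowest set bit and a trailing-zero count recovers its index. A
scalar illustration of that inversion, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t a[16] = "match_here_01234";
        const uint8_t b[16] = "match_herX_01234";
        uint32_t mask = 0;
        /* Emulate movemask: set bit i when byte i is identical in both buffers. */
        for (int i = 0; i < 16; i++)
            mask |= (uint32_t)(a[i] == b[i]) << i;
        if (mask != 0xFFFF) {
            /* Invert so identical bytes become 0; the lowest set bit marks the mismatch. */
            uint32_t inv = ~mask & 0xFFFF, idx = 0;
            while ((inv & 1) == 0) { inv >>= 1; idx++; }
            printf("first mismatch at byte %u\n", idx);   /* prints 9 */
        }
        return 0;
    }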
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
index a1ebe0e5b..f61402ae6 100644
--- a/arch/x86/compare256_avx512.c
+++ b/arch/x86/compare256_avx512.c
@@ -34,47 +34,36 @@ static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8
     xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
     xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
     mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
-    if (mask_0 != 0x0000FFFF) {
-        // There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
-        return match_byte;
-    }
+    if (mask_0 != 0x0000FFFF)
+        return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
 
     // 64 bytes
     zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
     zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
     mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
-    if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
-        return 16 + match_byte;
-    }
+    if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+        return 16 + zng_ctz64(~mask_1);
 
     // 64 bytes
     zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
     zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
     mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
-    if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
-        return 80 + match_byte;
-    }
+    if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+        return 80 + zng_ctz64(~mask_2);
 
     // 64 bytes
     zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
     zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
     mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
-    if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
-        return 144 + match_byte;
-    }
+    if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+        return 144 + zng_ctz64(~mask_3);
 
     // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
     zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
     zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
     mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
-    if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
-        return 192 + match_byte;
-    }
+    if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+        return 192 + zng_ctz64(~mask_4);
 
     return 256;
 }
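A quick way to see why the last load above can start at offset 192: the five loads
(16 bytes at 0, then 64 bytes at 16, 80, 144 and 192) cover all 256 bytes, and the
only effect of the overlap is that bytes 192..207 are read a second time after they
are already known to match. A throwaway check, not part of the patch:

    #include <stdio.h>

    int main(void) {
        /* Offsets and widths of the five loads in compare256_avx512_static. */
        const int off[5] = { 0, 16, 80, 144, 192 };
        const int len[5] = { 16, 64, 64, 64, 64 };
        int covered[256] = { 0 };
        for (int i = 0; i < 5; i++)
            for (int j = off[i]; j < off[i] + len[i]; j++)
                covered[j]++;
        int holes = 0, twice = 0;
        for (int j = 0; j < 256; j++) {
            holes += (covered[j] == 0);
            twice += (covered[j] > 1);
        }
        printf("holes=%d twice=%d\n", holes, twice);   /* prints holes=0 twice=16 */
        return 0;
    }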
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 1d539cb0d..2864b4df9 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -25,10 +25,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
     /* Compiler _may_ turn this branch into a ptest + movemask,
      * since a lot of those uops are shared and fused */
-    if (mask != 0xFFFF) {
-        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return match_byte;
-    }
+    if (mask != 0xFFFF)
+        return zng_ctz32(~mask);
 
     const uint8_t *last0 = src0 + 240;
     const uint8_t *last1 = src1 + 240;
@@ -49,10 +47,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
         /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return len + zng_ctz32(~mask);
 
         len += 16, src0 += 16, src1 += 16;
     }
@@ -64,10 +60,8 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
 
         mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
 
-        if (mask != 0xFFFF) {
-            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return 240 + match_byte;
-        }
+        if (mask != 0xFFFF)
+            return 240 + zng_ctz32(~mask);
     }
 
     return 256;
diff --git a/compare256_rle.h b/compare256_rle.h
index 5edfd734a..8fac7e108 100644
--- a/compare256_rle.h
+++ b/compare256_rle.h
@@ -83,14 +83,8 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
         mv = zng_memread_4(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
-            return len + match_byte;
-        }
+        if (diff)
+            return len + zng_ctz32(Z_U32_TO_LE(diff)) / 8;
 
         src1 += 4, len += 4;
     } while (len < 256);
@@ -116,14 +110,8 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
         mv = zng_memread_8(src1);
 
         diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
-            return len + (uint32_t)match_byte;
-        }
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
 
         src1 += 8, len += 8;
     } while (len < 256);
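The fallback_builtins.h change below prefers the compiler builtin, then the MSVC
intrinsics, and only then the De Bruijn multiply-and-lookup. The fallback is easy to
sanity-check against a naive bit loop; an illustrative standalone sketch, not part of
the patch, reusing the 32-bit table and constant from the diff:

    #include <assert.h>
    #include <stdint.h>

    /* De Bruijn CTZ: isolate the lowest set bit, multiply by a De Bruijn constant,
     * and the top 5 bits of the product index a 32-entry lookup table. */
    static uint32_t ctz32_debruijn(uint32_t value) {
        static const uint8_t tbl[32] = {
            0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
            31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
        };
        uint32_t lsb = value & (~value + 1u);   /* lowest set bit; value must be non-zero */
        return tbl[(lsb * 0x077CB531U) >> 27];
    }

    static uint32_t ctz32_naive(uint32_t value) {
        uint32_t n = 0;
        while ((value & 1) == 0) { value >>= 1; n++; }
        return n;
    }

    int main(void) {
        /* Every single-bit input must map to its bit index... */
        for (uint32_t i = 0; i < 32; i++)
            assert(ctz32_debruijn(1u << i) == i);
        /* ...and arbitrary non-zero inputs must agree with the naive count. */
        for (uint32_t v = 1; v < 1000000; v++)
            assert(ctz32_debruijn(v) == ctz32_naive(v));
        return 0;
    }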
diff --git a/fallback_builtins.h b/fallback_builtins.h
index 8ac842106..8ccd04e04 100644
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -1,51 +1,71 @@
 #ifndef FALLBACK_BUILTINS_H
 #define FALLBACK_BUILTINS_H
 
-/* Provide fallback for compilers that don't support __has_builtin */
-# ifndef __has_builtin
-# define __has_builtin(x) 0
-# endif
-
 #if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+#endif
 
-#include <intrin.h>
+/* Provide fallback for compilers that don't support __has_builtin */
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/* Count trailing zeros (CTZ) functions with portable fallback.
+ *
+ * Predicate: Input must be non-zero. The result is undefined for zero input because
+ * __builtin_ctz, BSF, and TZCNT all have undefined/different behavior for zero. TZCNT
+ * returns operand size for zero, BSF leaves destination undefined, and __builtin_ctz
+ * is explicitly undefined per GCC/Clang docs. */
 
-/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- * Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
- * If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
- * Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
- */
-Z_FORCEINLINE static int __builtin_ctz(unsigned int value) {
+Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) {
     Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctz)
+    return (uint32_t)__builtin_ctz(value);
+#elif defined(_MSC_VER) && !defined(__clang__)
 # if defined(X86_FEATURES) && !(_MSC_VER < 1700)
-    return (int)_tzcnt_u32(value);
+    /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */
+    return (uint32_t)_tzcnt_u32(value);
 # else
     unsigned long trailing_zero;
     _BitScanForward(&trailing_zero, value);
-    return (int)trailing_zero;
+    return (uint32_t)trailing_zero;
 # endif
+#else
+    /* De Bruijn CTZ for 32-bit values */
+    static const uint8_t debruijn_ctz32[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+    uint32_t lsb = value & (~value + 1u);
+    return debruijn_ctz32[(lsb * 0x077CB531U) >> 27];
+#endif
 }
-# define HAVE_BUILTIN_CTZ
 
-# ifdef ARCH_64BIT
-/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- */
-Z_FORCEINLINE static int __builtin_ctzll(unsigned long long value) {
+Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) {
     Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctzll)
+    return (uint32_t)__builtin_ctzll(value);
+#elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT)
 # if defined(X86_FEATURES) && !(_MSC_VER < 1700)
-    return (int)_tzcnt_u64(value);
+    /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */
+    return (uint32_t)_tzcnt_u64(value);
 # else
     unsigned long trailing_zero;
     _BitScanForward64(&trailing_zero, value);
-    return (int)trailing_zero;
+    return (uint32_t)trailing_zero;
 # endif
+#else
+    /* De Bruijn CTZ for 64-bit values */
+    static const uint8_t debruijn_ctz64[64] = {
+        63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27,
+        61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10,
+        62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9,
+        50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11
+    };
+    uint64_t lsb = value & (~value + 1ull);
+    return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58];
+#endif
 }
-# define HAVE_BUILTIN_CTZLL
-# endif // ARCH_64BIT
-
-#endif // _MSC_VER && !__clang__
 
 #if !__has_builtin(__builtin_bitreverse16)