cmp = veorq_u8(a, b);
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
- if (lane) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
- return len + match_byte;
- }
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
len += 8;
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
- if (lane) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
- return len + match_byte;
- }
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
len += 8;
src0 += 16, src1 += 16;
mv = zng_memread_4(src1);
diff = sv ^ mv;
- if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
- uint32_t match_byte = __builtin_ctz(diff) / 8;
-# else
- uint32_t match_byte = __builtin_clz(diff) / 8;
-# endif
- return len + match_byte;
- }
+ if (diff)
+ return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
src0 += 4, src1 += 4, len += 4;
} while (len < 256);
mv = zng_memread_8(src1);
diff = sv ^ mv;
- if (diff) {
-# if BYTE_ORDER == LITTLE_ENDIAN
- uint64_t match_byte = __builtin_ctzll(diff) / 8;
-# else
- uint64_t match_byte = __builtin_clzll(diff) / 8;
-# endif
- return len + (uint32_t)match_byte;
- }
+ if (diff)
+ return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
src0 += 8, src1 += 8, len += 8;
} while (len < 256);
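
The two generic hunks above fold the old endian-conditional ctz/clz into one expression: viewing the XOR difference as little-endian puts the byte at the lowest address into the least significant bits, so a trailing-zero count divided by 8 is always the mismatch offset. A minimal sketch of that identity, assuming Z_U32_FROM_LE is a no-op on little-endian targets and a byte swap on big-endian ones (the macro itself is not part of this excerpt; u32_from_le below is a hypothetical stand-in):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for the assumed semantics of Z_U32_FROM_LE:
     * identity on little-endian, byte swap on big-endian. */
    static uint32_t u32_from_le(uint32_t v) {
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        return __builtin_bswap32(v);
    #else
        return v;
    #endif
    }

    /* Offset of the first differing byte in two 4-byte blocks known to differ. */
    static uint32_t first_diff_byte4(const uint8_t *a, const uint8_t *b) {
        uint32_t av, bv;
        memcpy(&av, a, 4);            /* native-endian load, like zng_memread_4 */
        memcpy(&bv, b, 4);
        uint32_t diff = av ^ bv;      /* non-zero bytes mark mismatching positions */
        return (uint32_t)__builtin_ctz(u32_from_le(diff)) / 8;
    }

On little-endian this is exactly the old LITTLE_ENDIAN branch; on big-endian the swap moves the first memory byte into the low bits, so ctz replaces the old clz branch.
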
ymm_src1 = __lasx_xvld(src1, 0);
ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
- if (mask != 0xFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
- return len + match_byte;
- }
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
src0 += 32, src1 += 32, len += 32;
ymm_src1 = __lasx_xvld(src1, 0);
ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
mask = (unsigned)lasx_movemask_b(ymm_cmp);
- if (mask != 0xFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return len + match_byte;
- }
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return match_byte;
- }
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
const uint8_t *last0 = src0 + 240;
const uint8_t *last1 = src1 + 240;
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return len + match_byte;
- }
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
len += 16, src0 += 16, src1 += 16;
}
mask = (unsigned)lsx_movemask_b(xmm_cmp);
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return 240 + match_byte;
- }
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
}
return 256;
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
- if (mask != 0xFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
- return len + match_byte;
- }
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
src0 += 32, src1 += 32, len += 32;
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
- if (mask != 0xFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return len + match_byte;
- }
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
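
The LASX, LSX, AVX2 and SSE2 hunks in this patch all follow the same shape: a byte-equality compare yields 0xFF for matching bytes, movemask packs that into one bit per byte, and inverting the mask turns the first mismatch into the lowest set bit for a trailing-zero count. A self-contained SSE2 sketch of one 16-byte step (standard intrinsics only, with __builtin_ctz standing in for zng_ctz32):

    #include <stdint.h>
    #include <emmintrin.h>  /* SSE2 */

    /* Number of leading bytes that a and b have in common within one 16-byte block. */
    static uint32_t match_len16(const uint8_t *a, const uint8_t *b) {
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        __m128i eq = _mm_cmpeq_epi8(va, vb);              /* identical bytes -> 0xFF */
        unsigned mask = (unsigned)_mm_movemask_epi8(eq);  /* one bit per byte, bits 0..15 */
        if (mask != 0xFFFF)
            return (uint32_t)__builtin_ctz(~mask);        /* invert so the first mismatch is the lowest set bit */
        return 16;                                        /* whole block matches */
    }

The AVX-512 variant below skips the movemask step because its compare already produces a bit mask in a mask register; only the invert-and-ctz width differs.
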
xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
- mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended so zng_ctz32 can be used
- if (mask_0 != 0x0000FFFF) {
- // There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
- return match_byte;
- }
+ if (mask_0 != 0x0000FFFF)
+ return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
// 64 bytes
zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
- if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
- return 16 + match_byte;
- }
+ if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+ return 16 + zng_ctz64(~mask_1);
// 64 bytes
zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
- if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
- return 80 + match_byte;
- }
+ if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+ return 80 + zng_ctz64(~mask_2);
// 64 bytes
zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
- if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
- return 144 + match_byte;
- }
+ if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+ return 144 + zng_ctz64(~mask_3);
// 64 bytes (overlaps the previous 16 bytes for fast tail processing)
zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
- if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
- return 192 + match_byte;
- }
+ if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+ return 192 + zng_ctz64(~mask_4);
return 256;
}
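
The load offsets in the AVX-512 path are picked so that five compares cover all 256 bytes with no scalar tail: 16 bytes at offset 0, 64-byte blocks at 16, 80 and 144 (reaching byte 208), and a final 64-byte block at 192 that ends exactly at 256. The 16-byte overlap at [192, 208) is harmless because a mismatch there is already caught by the block at 144, so the final return only ever fires for offsets 208 and up, where 192 + ctz is still correct. A few compile-time checks spelling out the coverage (a sketch, not part of the patch):

    #include <assert.h>

    /* Coverage of the five compares: [0,16) [16,80) [80,144) [144,208) [192,256) */
    static_assert( 16 + 64 ==  80, "block at 16 ends where the block at 80 starts");
    static_assert( 80 + 64 == 144, "block at 80 ends where the block at 144 starts");
    static_assert(144 + 64 == 208, "block at 144 ends at byte 208");
    static_assert(192 + 64 == 256, "final block ends at 256, overlapping [192,208)");
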
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return match_byte;
- }
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
const uint8_t *last0 = src0 + 240;
const uint8_t *last1 = src1 + 240;
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return len + match_byte;
- }
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
len += 16, src0 += 16, src1 += 16;
}
mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
- if (mask != 0xFFFF) {
- uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
- return 240 + match_byte;
- }
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
}
return 256;
mv = zng_memread_4(src1);
diff = sv ^ mv;
- if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
- uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
- uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
- return len + match_byte;
- }
+ if (diff)
+ return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
src1 += 4, len += 4;
} while (len < 256);
mv = zng_memread_8(src1);
diff = sv ^ mv;
- if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
- uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
- uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
- return len + (uint32_t)match_byte;
- }
+ if (diff)
+ return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
src1 += 8, len += 8;
} while (len < 256);
#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H
-/* Provide fallback for compilers that don't support __has_builtin */
-# ifndef __has_builtin
-# define __has_builtin(x) 0
-# endif
-
#if defined(_MSC_VER) && !defined(__clang__)
-#include <intrin.h>
+# include <intrin.h>
+#endif
+/* Provide fallback for compilers that don't support __has_builtin */
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/* Count trailing zeros (CTZ) helpers with a portable fallback.
+ *
+ * Precondition: the input must be non-zero. The result is undefined for zero input because
+ * the underlying operations all disagree there: TZCNT returns the operand size, BSF leaves
+ * its destination undefined, and __builtin_ctz/__builtin_ctzll are documented as undefined
+ * by GCC and Clang. */
-/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- * Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
- * If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
- * Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
- */
-Z_FORCEINLINE static int __builtin_ctz(unsigned int value) {
+Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) {
Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctz)
+ return (uint32_t)__builtin_ctz(value);
+#elif defined(_MSC_VER) && !defined(__clang__)
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
- return (int)_tzcnt_u32(value);
+ /* TZCNT executes as BSF on CPUs without BMI1, and is as fast or faster than BSF on all x86 CPUs. */
+ return (uint32_t)_tzcnt_u32(value);
# else
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
- return (int)trailing_zero;
+ return (uint32_t)trailing_zero;
# endif
+#else
+ /* De Bruijn CTZ for 32-bit values: the isolated low bit times the De Bruijn constant
+ * yields a unique top-5-bit index into the table. */
+ static const uint8_t debruijn_ctz32[32] = {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+ uint32_t lsb = value & (~value + 1u);
+ return debruijn_ctz32[(lsb * 0x077CB531U) >> 27];
+#endif
}
-# define HAVE_BUILTIN_CTZ
-# ifdef ARCH_64BIT
-/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
- * Because of that assumption trailing_zero is not initialized and the return value is not checked.
- */
-Z_FORCEINLINE static int __builtin_ctzll(unsigned long long value) {
+Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) {
Assert(value != 0, "Invalid input value: 0");
+#if __has_builtin(__builtin_ctzll)
+ return (uint32_t)__builtin_ctzll(value);
+#elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT)
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
- return (int)_tzcnt_u64(value);
+ /* TZCNT executes as BSF on CPUs without BMI1, and is as fast or faster than BSF on all x86 CPUs. */
+ return (uint32_t)_tzcnt_u64(value);
# else
unsigned long trailing_zero;
_BitScanForward64(&trailing_zero, value);
- return (int)trailing_zero;
+ return (uint32_t)trailing_zero;
# endif
+#else
+ /* De Bruijn CTZ for 64-bit values: the isolated low bit times the De Bruijn constant
+ * yields a unique top-6-bit index into the table. */
+ static const uint8_t debruijn_ctz64[64] = {
+ 63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27,
+ 61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10,
+ 62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9,
+ 50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11
+ };
+ uint64_t lsb = value & (~value + 1ull);
+ return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58];
+#endif
}
-# define HAVE_BUILTIN_CTZLL
-# endif // ARCH_64BIT
-
-#endif // _MSC_VER && !__clang__
#if !__has_builtin(__builtin_bitreverse16)
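
The De Bruijn fallback in zng_ctz32/zng_ctz64 only ever indexes with a single-bit value, because the lowest set bit is isolated first, so the table can be verified exhaustively in a few lines. A standalone check of the 32-bit multiplier and table copied from the patch (the 64-bit pair can be verified the same way with a 64-iteration loop and a shift of 58):

    #include <stdint.h>
    #include <stdio.h>

    /* Same table and multiplier as the zng_ctz32 fallback above. */
    static uint32_t debruijn_ctz32(uint32_t value) {
        static const uint8_t tab[32] = {
            0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
            31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
        };
        uint32_t lsb = value & (~value + 1u);   /* isolate the lowest set bit */
        return tab[(lsb * 0x077CB531U) >> 27];  /* top 5 bits are a perfect hash of the 32 powers of two */
    }

    int main(void) {
        for (uint32_t k = 0; k < 32; k++) {
            if (debruijn_ctz32(UINT32_C(1) << k) != k) {
                printf("mismatch at bit %u\n", k);
                return 1;
            }
        }
        if (debruijn_ctz32(0xFFFFFF00u) != 8)   /* non-power-of-two input exercises the isolation step */
            return 1;
        puts("32-bit De Bruijn ctz table verified");
        return 0;
    }
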