#define X86_CTZL_H
#include <intrin.h>
-#ifdef X86_CPUID
-# include "x86.h"
-#endif
#if defined(_MSC_VER) && !defined(__clang__)
-/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
- * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
- */
+/* __builtin_ctzl
+ * - For 0, the result is undefined
+ * - On the x86 architecture, it is typically implemented using BSF
+ * - the equivalent intrinsic on MSC is _BitScanForward
+ *
+ * _tzcnt_u32
+ * - For 0, the result is the size of the operand
+ * - On processors that do not support TZCNT, the instruction byte encoding is executed as BSF. In this case the result for 0
+ * is undefined.
+ * - Performance:
+ * + AMD: The reciprocal throughput for TZCNT is 2 vs 3 for BSF
+ * + Intel: On modern Intel CPUs (Haswell), the performance of TZCNT is equivalent to BSF
+ * Reference: http://www.agner.org/optimize/instruction_tables.pdf
+*/
+#if defined(_M_IX86) || defined(_M_AMD64)
+#define __builtin_ctzl _tzcnt_u32
+#else
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
{
-#ifdef X86_CPUID
- if (x86_cpu_has_tzcnt)
- return _tzcnt_u32(value);
-#endif
- unsigned long trailing_zero;
- _BitScanForward(&trailing_zero, value);
- return trailing_zero;
+ unsigned long trailing_zero;
+ _BitScanForward(&trailing_zero, value);
+ return trailing_zero;
}
#endif
+#endif
#endif