From: Mat Date: Wed, 31 May 2017 16:55:32 +0000 (+0200) Subject: x86: use TZCNT (#113) X-Git-Tag: 1.9.9-b1~660^2~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a7271104bf9a2d82dc6a69090c12442eacd2fd71;p=thirdparty%2Fzlib-ng.git x86: use TZCNT (#113) x86: use TZCNT instruction On processors that do not support TZCNT, the instruction byte encoding is executed as BSF. TZCNT is faster on AMD than BSF. --- diff --git a/arch/x86/ctzl.h b/arch/x86/ctzl.h index bc9e9bd5b..7482788bf 100644 --- a/arch/x86/ctzl.h +++ b/arch/x86/ctzl.h @@ -2,24 +2,32 @@ #define X86_CTZL_H #include <intrin.h> -#ifdef X86_CPUID -# include "x86.h" -#endif #if defined(_MSC_VER) && !defined(__clang__) -/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0 - * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked - */ +/* __builtin_ctzl + * - For 0, the result is undefined + * - On the x86 architecture, it is typically implemented using BSF + * - the equivalent intrinsic on MSC is _BitScanForward + * + * _tzcnt_u32 + * - For 0, the result is the size of the operand + * - On processors that do not support TZCNT, the instruction byte encoding is executed as BSF. In this case the result for 0 + * is undefined. 
+ * - Performance: + * + AMD: The reciprocal throughput for TZCNT is 2 vs 3 for BSF + * + Intel: On modern Intel CPUs (Haswell), the performance of TZCNT is equivalent to BSF + * Reference: http://www.agner.org/optimize/instruction_tables.pdf +*/ +#if defined(_M_IX86) || defined(_M_AMD64) +#define __builtin_ctzl _tzcnt_u32 +#else static __forceinline unsigned long __builtin_ctzl(unsigned long value) { -#ifdef X86_CPUID - if (x86_cpu_has_tzcnt) - return _tzcnt_u32(value); -#endif - unsigned long trailing_zero; - _BitScanForward(&trailing_zero, value); - return trailing_zero; + unsigned long trailing_zero; + _BitScanForward(&trailing_zero, value); + return trailing_zero; } #endif +#endif #endif