From: Mat <mberchtold@gmail.com>
Date: Wed, 31 May 2017 16:55:32 +0000 (+0200)
Subject: x86: use TZCNT (#113)
X-Git-Tag: 1.9.9-b1~660^2~8
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a7271104bf9a2d82dc6a69090c12442eacd2fd71;p=thirdparty%2Fzlib-ng.git

x86: use TZCNT (#113)

x86: use TZCNT instruction
On processors that do not support TZCNT, the instruction byte encoding is executed as BSF.
TZCNT is faster on AMD than BSF.
---

diff --git a/arch/x86/ctzl.h b/arch/x86/ctzl.h
index bc9e9bd5b..7482788bf 100644
--- a/arch/x86/ctzl.h
+++ b/arch/x86/ctzl.h
@@ -2,24 +2,32 @@
 #define X86_CTZL_H
 
 #include <intrin.h>
-#ifdef X86_CPUID
-# include "x86.h"
-#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
-/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
- * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
- */
+/* __builtin_ctzl
+ *  - For 0, the result is undefined
+ *  - On the x86 architecture, it is typically implemented using BSF
+ *  - The equivalent intrinsic on MSVC is _BitScanForward
+ *
+ * _tzcnt_u32
+ *  - For 0, the result is the size of the operand in bits (32)
+ *  - On processors that do not support TZCNT, the instruction byte encoding is executed as BSF. In this case the result for 0
+ *    is undefined.
+ *  - Performance:
+ *    + AMD: The reciprocal throughput for TZCNT is 2 vs 3 for BSF
+ *    + Intel: On modern Intel CPUs (Haswell), the performance of TZCNT is equivalent to BSF
+ *    Reference: http://www.agner.org/optimize/instruction_tables.pdf
+ */
+#if defined(_M_IX86) || defined(_M_AMD64)
+#define __builtin_ctzl _tzcnt_u32
+#else
 static __forceinline unsigned long __builtin_ctzl(unsigned long value)
 {
-#ifdef X86_CPUID
-	if (x86_cpu_has_tzcnt)
-		return _tzcnt_u32(value);
-#endif
-	unsigned long trailing_zero;
-	_BitScanForward(&trailing_zero, value);
-	return trailing_zero;
+    unsigned long trailing_zero;
+    _BitScanForward(&trailing_zero, value);
+    return trailing_zero;
 }
 #endif
+#endif
 
 #endif