--- /dev/null
+
+#include <stdio.h>
+
+typedef unsigned long long int ULong;
+typedef unsigned int UInt;
+
+__attribute__((noinline))
+void do_lzcnt64 ( /*OUT*/UInt* flags, /*OUT*/ULong* res, ULong arg )
+{
+ ULong block[3] = { arg, 0ULL, 0ULL };
+ __asm__ __volatile__(
+ "movabsq $0x5555555555555555, %%r11" "\n\t"
+ "lzcntq 0(%0), %%r11" "\n\t"
+ "movq %%r11, 8(%0)" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%r11" "\n\t"
+ "movq %%r11, 16(%0)" "\n"
+ : : "r"(&block[0]) : "r11","cc","memory"
+ );
+ *res = block[1];
+ *flags = block[2] & 0x8d5;
+}
+
+__attribute__((noinline))
+void do_lzcnt32 ( /*OUT*/UInt* flags, /*OUT*/ULong* res, ULong arg )
+{
+ ULong block[3] = { arg, 0ULL, 0ULL };
+ __asm__ __volatile__(
+ "movabsq $0x5555555555555555, %%r11" "\n\t"
+ "lzcntl 0(%0), %%r11d" "\n\t"
+ "movq %%r11, 8(%0)" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%r11" "\n\t"
+ "movq %%r11, 16(%0)" "\n"
+ : : "r"(&block[0]) : "r11","cc","memory"
+ );
+ *res = block[1];
+ *flags = block[2] & 0x8d5;
+}
+
+__attribute__((noinline))
+void do_lzcnt16 ( /*OUT*/UInt* flags, /*OUT*/ULong* res, ULong arg )
+{
+ ULong block[3] = { arg, 0ULL, 0ULL };
+ __asm__ __volatile__(
+ "movabsq $0x5555555555555555, %%r11" "\n\t"
+ "lzcntw 0(%0), %%r11w" "\n\t"
+ "movq %%r11, 8(%0)" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%r11" "\n\t"
+ "movq %%r11, 16(%0)" "\n"
+ : : "r"(&block[0]) : "r11","cc","memory"
+ );
+ *res = block[1];
+ *flags = block[2] & 0x8d5;
+}
+
+int main ( void )
+{
+ ULong w;
+
+ w = 0xFEDC192837475675ULL;
+ while (1) {
+ ULong res;
+ UInt flags;
+ do_lzcnt64(&flags, &res, w);
+ printf("lzcntq %016llx -> %016llx %04x\n", w, res, flags);
+ if (w == 0) break;
+ w = ((w >> 2) | (w >> 1)) + (w / 17ULL);
+ }
+
+ w = 0xFEDC192837475675ULL;
+ while (1) {
+ ULong res;
+ UInt flags;
+ do_lzcnt32(&flags, &res, w);
+ printf("lzcntl %016llx -> %016llx %04x\n", w, res, flags);
+ if (w == 0) break;
+ w = ((w >> 2) | (w >> 1)) + (w / 17ULL);
+ }
+
+ w = 0xFEDC192837475675ULL;
+ while (1) {
+ ULong res;
+ UInt flags;
+ do_lzcnt16(&flags, &res, w);
+ printf("lzcntw %016llx -> %016llx %04x\n", w, res, flags);
+ if (w == 0) break;
+ w = ((w >> 2) | (w >> 1)) + (w / 17ULL);
+ }
+
+ return 0;
+}
--- /dev/null
+
+#include <stdio.h>
+
+typedef unsigned long long int ULong;
+typedef unsigned int UInt;
+
+__attribute__((noinline))
+void do_lzcnt32 ( /*OUT*/UInt* flags, /*OUT*/UInt* res, UInt arg )
+{
+ UInt block[3] = { arg, 0, 0 };
+ __asm__ __volatile__(
+ "movl $0x55555555, %%esi" "\n\t"
+ "lzcntl 0(%0), %%esi" "\n\t"
+ "movl %%esi, 4(%0)" "\n\t"
+ "pushfl" "\n\t"
+ "popl %%esi" "\n\t"
+ "movl %%esi, 8(%0)" "\n"
+ : : "r"(&block[0]) : "esi","cc","memory"
+ );
+ *res = block[1];
+ *flags = block[2] & 0x8d5;
+}
+
+__attribute__((noinline))
+void do_lzcnt16 ( /*OUT*/UInt* flags, /*OUT*/UInt* res, UInt arg )
+{
+ UInt block[3] = { arg, 0, 0 };
+ __asm__ __volatile__(
+ "movl $0x55555555, %%esi" "\n\t"
+ "lzcntw 0(%0), %%si" "\n\t"
+ "movl %%esi, 4(%0)" "\n\t"
+ "pushfl" "\n\t"
+ "popl %%esi" "\n\t"
+ "movl %%esi, 8(%0)" "\n"
+ : : "r"(&block[0]) : "esi","cc","memory"
+ );
+ *res = block[1];
+ *flags = block[2] & 0x8d5;
+}
+
+int main ( void )
+{
+ UInt w;
+
+ w = 0xFEDC1928;
+ while (1) {
+ UInt res;
+ UInt flags;
+ do_lzcnt32(&flags, &res, w);
+ printf("lzcntl %08x -> %08x %04x\n", w, res, flags);
+ if (w == 0) break;
+ w = ((w >> 2) | (w >> 1)) + (w / 17);
+ }
+
+ w = 0xFEDC1928;
+ while (1) {
+ UInt res;
+ UInt flags;
+ do_lzcnt16(&flags, &res, w);
+ printf("lzcntw %08x -> %08x %04x\n", w, res, flags);
+ if (w == 0) break;
+ w = ((w >> 2) | (w >> 1)) + (w / 17);
+ }
+
+ return 0;
+}