]> git.ipfire.org Git - thirdparty/valgrind.git/commitdiff
Add a test program for PCMPISTRI on amd64 (not yet hooked up).
authorJulian Seward <jseward@acm.org>
Fri, 6 Aug 2010 08:10:45 +0000 (08:10 +0000)
committerJulian Seward <jseward@acm.org>
Fri, 6 Aug 2010 08:10:45 +0000 (08:10 +0000)
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11250

none/tests/amd64/pcmpstr64.c [new file with mode: 0644]

diff --git a/none/tests/amd64/pcmpstr64.c b/none/tests/amd64/pcmpstr64.c
new file mode 100644 (file)
index 0000000..b3defd2
--- /dev/null
@@ -0,0 +1,596 @@
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef  unsigned char  V128[16];
+typedef  unsigned int   UInt;
+typedef  signed int     Int;
+typedef  unsigned char  UChar;
+typedef  unsigned long long int ULong;
+
+#define SHIFT_O   11
+#define SHIFT_S   7
+#define SHIFT_Z   6
+#define SHIFT_A   4
+#define SHIFT_C   0
+#define SHIFT_P   2
+
+#define MASK_O    (1ULL << SHIFT_O)
+#define MASK_S    (1ULL << SHIFT_S)
+#define MASK_Z    (1ULL << SHIFT_Z)
+#define MASK_A    (1ULL << SHIFT_A)
+#define MASK_C    (1ULL << SHIFT_C)
+#define MASK_P    (1ULL << SHIFT_P)
+
+
+UInt clz32 ( UInt x )
+{
+   Int y, m, n;
+   y = -(x >> 16);
+   m = (y >> 16) & 16;
+   n = 16 - m;
+   x = x >> m;
+   y = x - 0x100;
+   m = (y >> 16) & 8;
+   n = n + m;
+   x = x << m;
+   y = x - 0x1000;
+   m = (y >> 16) & 4;
+   n = n + m;
+   x = x << m;
+   y = x - 0x4000;
+   m = (y >> 16) & 2;
+   n = n + m;
+   x = x << m;
+   y = x >> 14;
+   m = y & ~(y >> 1);
+   return n + 2 - m;
+}
+
+UInt ctz32 ( UInt x )
+{
+   return 32 - clz32((~x) & (x-1));
+}
+
+void expand ( V128* dst, char* summary )
+{
+   Int i;
+   assert( strlen(summary) == 16 );
+   for (i = 0; i < 16; i++) {
+      UChar xx = 0;
+      UChar x = summary[15-i];
+      if      (x >= '0' && x <= '9') { xx = x - '0'; }
+      else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
+      else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
+      else assert(0);
+
+      assert(xx < 16);
+      xx = (xx << 4) | xx;
+      assert(xx < 256);
+      (*dst)[i] = xx;
+   }
+}
+
+void try_istri ( char* which,
+                 UInt(*h_fn)(V128*,V128*),
+                 UInt(*s_fn)(V128*,V128*),
+                 char* summL, char* summR )
+{
+   assert(strlen(which) == 2);
+   V128 argL, argR;
+   expand(&argL, summL);
+   expand(&argR, summR);
+   UInt h_res = h_fn(&argL, &argR);
+   UInt s_res = s_fn(&argL, &argR);
+   printf("istri %s  %s %s -> %08x %08x %s\n",
+          which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
+}
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_4A                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_4A ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"
+      "movdqu    0(%2),  %%xmm2"            "\n\t"
+      "movdqu    16(%2), %%xmm11"           "\n\t"
+      "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
+{
+   /* signed bytes  (also works for unsigned)
+      equal each    (straightforward parallel compare)
+      polarity +    (IntRes2 = IntRes1)
+      index 1       (want index of ms 1 bit)
+   */
+   Int    i;
+   UChar* argL = (UChar*)argLU;
+   UChar* argR = (UChar*)argRU;
+   UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+   for (i = 15; i >= 0; i--) {
+      UChar cL  = argL[i];
+      UChar cR  = argR[i];
+      zmaskL    = (zmaskL << 1)    | (cL == 0  ? 1 : 0);
+      zmaskR    = (zmaskR << 1)    | (cR == 0  ? 1 : 0);
+      boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+   }
+   UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+   UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+
+   // do invalidation, common to all equal-each cases
+   UInt intRes1
+      = (boolResII & validL & validR)  // if both valid, use cmpres
+        | (~ (validL | validR));       // if both invalid, force 1
+                                       // else force 0
+   intRes1 &= 0xFFFF;
+
+   // polarity: +
+   UInt intRes2 = intRes1;
+
+   // generate ecx value, common to all index-of-ms-1-bit cases
+   UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
+
+   // generate new flags, common to all ISTRI and ISTRM cases
+   UInt newFlags    // A, P are zero
+      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
+      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
+
+   return (newFlags << 16) | newECX;
+}
+
+void istri_4A ( void )
+{
+   char* wot = "4A";
+   UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
+   UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
+}
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_3A                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_3A ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"
+      "movdqu    0(%2),  %%xmm2"            "\n\t"
+      "movdqu    16(%2), %%xmm11"           "\n\t"
+      "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
+{
+   /* signed bytes      (also works for unsigned)
+      equal each        (straightforward parallel compare)
+      polarity Masked-  (IntRes2 = IntRes1 ^ validL)
+      index 0           (want index of ls 1 bit)
+   */
+   Int    i;
+   UChar* argL = (UChar*)argLU;
+   UChar* argR = (UChar*)argRU;
+   UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+   for (i = 15; i >= 0; i--) {
+      UChar cL  = argL[i];
+      UChar cR  = argR[i];
+      zmaskL    = (zmaskL << 1)    | (cL == 0  ? 1 : 0);
+      zmaskR    = (zmaskR << 1)    | (cR == 0  ? 1 : 0);
+      boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+   }
+   UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+   UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+
+   // do invalidation, common to all equal-each cases
+   UInt intRes1
+      = (boolResII & validL & validR)  // if both valid, use cmpres
+        | (~ (validL | validR));       // if both invalid, force 1
+                                       // else force 0
+   intRes1 &= 0xFFFF;
+
+   // polarity: Masked-
+   UInt intRes2 = (intRes1 ^ validL) & 0xFFFF;
+
+   // generate ecx value, common to all index-of-ls-1-bit cases
+   UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+   // generate new flags, common to all ISTRI and ISTRM cases
+   UInt newFlags    // A, P are zero
+      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
+      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
+
+   return (newFlags << 16) | newECX;
+}
+
+void istri_3A ( void )
+{
+   char* wot = "3A";
+   UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
+   UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
+}
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_0C                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+__attribute__((noinline))
+UInt h_pcmpistri_0C ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res = 0, flags = 0;
+   __asm__ __volatile__(
+      "movdqa    0(%2),  %%xmm2"            "\n\t"
+      "movdqa    16(%2), %%xmm11"           "\n\t"
+      "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
+      //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
+      //"movd %%xmm0, %%ecx" "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0][0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
+{
+   /* unsigned bytes
+      equal ordered     (substring search)
+      polarity +        (IntRes2 = IntRes1)
+      index 0           (want index of ls 1 bit)
+
+      argL: haystack,  argR: needle
+   */
+   UInt i, hi, ni;
+   UChar* argL = (UChar*)argLU;
+   UChar* argR = (UChar*)argRU;
+   UInt boolRes = 0, zmaskL = 0, zmaskR = 0;
+   UInt keepSearching = 1;
+   for (i = 0; i < 16; i++) {
+      UChar cL  = argL[i];
+      UChar cR  = argR[i];
+      zmaskL    = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0);
+      zmaskR    = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0);
+
+      if (argL[i] == 0) {
+        // run off the end of the haystack.
+        keepSearching = 0;
+      } 
+
+      UInt m = 1;
+      if (keepSearching) {
+         for (ni = 0; ni < 16; ni++) {
+            if (argR[ni] == 0) break;
+            hi = ni + i;
+            if (hi >= 16) break;
+            if (argL[hi] != argR[ni]) { m = 0; break; }
+         }
+      } else {
+         m = 0;
+      }
+      boolRes = (boolRes >> 1) | (m << 15);
+
+   }
+
+   // boolRes is "pre-invalidated"
+   UInt intRes1 = boolRes & 0xFFFF;
+
+   // polarity: +
+   UInt intRes2 = intRes1;
+
+   // generate ecx value, common to all index-of-ls-1-bit cases
+   UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+   // generate new flags, common to all ISTRI and ISTRM cases
+   UInt newFlags    // A, P are zero
+      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
+      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
+
+   return (newFlags << 16) | newECX;
+}
+
+void istri_0C ( void )
+{
+   char* wot = "0C";
+   UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
+   UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
+   
+   try_istri(wot,h,s, "111111111abcde11", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde"); 
+
+   try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde"); 
+   try_istri(wot,h,s, "11111111111abcde", "00000000000abcde"); 
+   try_istri(wot,h,s, "111111111111abcd", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde"); 
+   try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde"); 
+
+   try_istri(wot,h,s, "1111111111111234", "0000000000000000"); 
+   try_istri(wot,h,s, "1111111111111234", "0000000000000001"); 
+   try_istri(wot,h,s, "1111111111111234", "0000000000000011"); 
+
+   try_istri(wot,h,s, "1111111111111234", "1111111111111234"); 
+   try_istri(wot,h,s, "a111111111111111", "000000000000000a"); 
+   try_istri(wot,h,s, "b111111111111111", "000000000000000a"); 
+}
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                       ISTRI_08                       //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_08 ( V128* argL, V128* argR )
+{
+   V128 block[2];
+   memcpy(&block[0], argL, sizeof(V128));
+   memcpy(&block[1], argR, sizeof(V128));
+   ULong res, flags;
+   __asm__ __volatile__(
+      "subq      $1024,  %%rsp"             "\n\t"
+      "movdqu    0(%2),  %%xmm2"            "\n\t"
+      "movdqu    16(%2), %%xmm11"           "\n\t"
+      "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
+      "pushfq"                              "\n\t"
+      "popq      %%rdx"                     "\n\t"
+      "movq      %%rcx,  %0"                "\n\t"
+      "movq      %%rdx,  %1"                "\n\t"
+      "addq      $1024,  %%rsp"             "\n\t"
+      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+   );
+   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
+{
+   /* unsigned bytes    (also works for unsigned)
+      equal each        (straightforward parallel compare)
+      polarity +        (IntRes2 = IntRes1)
+      index 0           (want index of ls 1 bit)
+   */
+   Int    i;
+   UChar* argL = (UChar*)argLU;
+   UChar* argR = (UChar*)argRU;
+   UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+   for (i = 15; i >= 0; i--) {
+      UChar cL  = argL[i];
+      UChar cR  = argR[i];
+      zmaskL    = (zmaskL << 1)    | (cL == 0  ? 1 : 0);
+      zmaskR    = (zmaskR << 1)    | (cR == 0  ? 1 : 0);
+      boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+   }
+   UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
+   UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
+
+   // do invalidation, common to all equal-each cases
+   UInt intRes1
+      = (boolResII & validL & validR)  // if both valid, use cmpres
+        | (~ (validL | validR));       // if both invalid, force 1
+                                       // else force 0
+   intRes1 &= 0xFFFF;
+
+   // polarity: +
+   UInt intRes2 = intRes1;
+
+   // generate ecx value, common to all index-of-ls-1-bit cases
+   UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+   // generate new flags, common to all ISTRI and ISTRM cases
+   UInt newFlags    // A, P are zero
+      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
+      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
+
+   return (newFlags << 16) | newECX;
+}
+
+void istri_08 ( void )
+{
+   char* wot = "08";
+   UInt(*h)(V128*,V128*) = h_pcmpistri_08;
+   UInt(*s)(V128*,V128*) = s_pcmpistri_08;
+
+   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
+   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
+   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 
+
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
+   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 
+
+   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
+   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
+}
+
+
+
+
+
+//////////////////////////////////////////////////////////
+//                                                      //
+//                         main                         //
+//                                                      //
+//////////////////////////////////////////////////////////
+
+int main ( void )
+{
+   istri_4A();
+   istri_3A();
+   istri_08();
+   istri_0C();
+   return 0;
+}