typedef signed int Int;
typedef unsigned char UChar;
typedef unsigned long long int ULong;
+typedef UChar Bool;
+#define False ((Bool)0)
+#define True ((Bool)1)
#define SHIFT_O 11
#define SHIFT_S 7
which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
}
+UInt zmask_from_V128 ( V128* arg )
+{
+ UInt i, res = 0;
+ for (i = 0; i < 16; i++) {
+ res |= (((*arg)[i] == 0) ? 1 : 0) << i;
+ }
+ return res;
+}
+
+//////////////////////////////////////////////////////////
+// //
+// GENERAL //
+// //
+//////////////////////////////////////////////////////////
+
+
+/* Given partial results from a pcmpXstrX operation (intRes1,
+ basically), generate an I format (index value for ECX) output, and
+ also the new OSZACP flags.
+*/
+static
+void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
+ /*OUT*/UInt* resOSZACP,
+ UInt intRes1,
+ UInt zmaskL, UInt zmaskR,
+ UInt validL,
+ UInt pol, UInt idx )
+{
+ assert((pol >> 2) == 0);
+ assert((idx >> 1) == 0);
+
+ UInt intRes2 = 0;
+ switch (pol) {
+ case 0: intRes2 = intRes1; break; // pol +
+ case 1: intRes2 = ~intRes1; break; // pol -
+ case 2: intRes2 = intRes1; break; // pol m+
+ case 3: intRes2 = intRes1 ^ validL; break; // pol m-
+ }
+ intRes2 &= 0xFFFF;
+
+ // generate ecx value
+ UInt newECX = 0;
+ if (idx) {
+ // index of ms-1-bit
+ newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
+ } else {
+ // index of ls-1-bit
+ newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+ }
+
+ *(UInt*)(&resV[0]) = newECX;
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ *resOSZACP // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+}
+
+
+/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
+ variants.
+
+ For xSTRI variants, the new ECX value is placed in the 32 bits
+ pointed to by *resV. For xSTRM variants, the result is a 128 bit
+ value and is placed at *resV in the obvious way.
+
+ For all variants, the new OSZACP value is placed at *resOSZACP.
+
+ argLV and argRV are the vector args. The caller must prepare a
+ 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
+ must be 1 for each zero byte of of the respective arg. For ESTRx
+ variants this is derived from the explicit length indication, and
+ must be 0 in all places except at the bit index corresponding to
+ the valid length (0 .. 16). If the valid length is 16 then the
+ mask must be all zeroes. In all cases, bits 31:16 must be zero.
+
+ imm8 is the original immediate from the instruction. isSTRM
+ indicates whether this is a xSTRM or xSTRI variant, which controls
+ how much of *res is written.
+
+ If the given imm8 case can be handled, the return value is True.
+ If not, False is returned, and neither *res not *resOSZACP are
+ altered.
+*/
+
+Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
+ /*OUT*/UInt* resOSZACP,
+ V128* argLV, V128* argRV,
+ UInt zmaskL, UInt zmaskR,
+ UInt imm8, Bool isSTRM )
+{
+ assert(imm8 < 0x80);
+ assert((zmaskL >> 16) == 0);
+ assert((zmaskR >> 16) == 0);
+
+ UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format
+ UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn
+ UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity
+ UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask
+
+ /*----------------------------------------*/
+ /*-- strcmp on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 2/*equal each, aka strcmp*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
+ && !isSTRM) {
+ Int i;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolResII = 0;
+ for (i = 15; i >= 0; i--) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+ }
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+
+ // do invalidation, common to all equal-each cases
+ UInt intRes1
+ = (boolResII & validL & validR) // if both valid, use cmpres
+ | (~ (validL | validR)); // if both invalid, force 1
+ // else force 0
+ intRes1 &= 0xFFFF;
+
+ // generate I-format output
+ pcmpXstrX_WRK_gen_output_fmt_I(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- set membership on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 0/*equal any, aka find chars in a set*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
+ && !isSTRM) {
+ /* argL: the string, argR: charset */
+ UInt si, ci;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+
+ for (si = 0; si < 16; si++) {
+ if ((validL & (1 << si)) == 0)
+ // run off the end of the string.
+ break;
+ UInt m = 0;
+ for (ci = 0; ci < 16; ci++) {
+ if ((validR & (1 << ci)) == 0) break;
+ if (argR[ci] == argL[si]) { m = 1; break; }
+ }
+ boolRes |= (m << si);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ pcmpXstrX_WRK_gen_output_fmt_I(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- substring search on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 3/*equal ordered, aka substring search*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
+ && !isSTRM) {
+
+ /* argL: haystack, argR: needle */
+ UInt ni, hi;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+ for (hi = 0; hi < 16; hi++) {
+ if ((validL & (1 << hi)) == 0)
+ // run off the end of the haystack
+ break;
+ UInt m = 1;
+ for (ni = 0; ni < 16; ni++) {
+ if ((validR & (1 << ni)) == 0) break;
+ UInt i = ni + hi;
+ if (i >= 16) break;
+ if (argL[i] != argR[ni]) { m = 0; break; }
+ }
+ boolRes |= (m << hi);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ pcmpXstrX_WRK_gen_output_fmt_I(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- ranges, unsigned byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 1/*ranges*/
+ && fmt == 0/*ub*/
+ && !isSTRM) {
+
+ /* argL: string, argR: range-pairs */
+ UInt ri, si;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+ for (si = 0; si < 16; si++) {
+ if ((validL & (1 << si)) == 0)
+ // run off the end of the string
+ break;
+ UInt m = 0;
+ for (ri = 0; ri < 16; ri += 2) {
+ if ((validR & (3 << ri)) != (3 << ri)) break;
+ if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
+ m = 1; break;
+ }
+ }
+ boolRes |= (m << si);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ pcmpXstrX_WRK_gen_output_fmt_I(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx
+ );
+
+ return True;
+ }
+
+ return False;
+}
+
+
//////////////////////////////////////////////////////////
// //
// ISTRI_4A //
UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
{
- /* signed bytes (also works for unsigned)
- equal each (straightforward parallel compare)
- polarity + (IntRes2 = IntRes1)
- index 1 (want index of ms 1 bit)
- */
- Int i;
- UChar* argL = (UChar*)argLU;
- UChar* argR = (UChar*)argRU;
- UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
- for (i = 15; i >= 0; i--) {
- UChar cL = argL[i];
- UChar cR = argR[i];
- zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
- zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
- boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
- }
- UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
- UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
-
- // do invalidation, common to all equal-each cases
- UInt intRes1
- = (boolResII & validL & validR) // if both valid, use cmpres
- | (~ (validL | validR)); // if both invalid, force 1
- // else force 0
- intRes1 &= 0xFFFF;
-
- // polarity: +
- UInt intRes2 = intRes1;
-
- // generate ecx value, common to all index-of-ms-1-bit cases
- UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
-
- // generate new flags, common to all ISTRI and ISTRM cases
- UInt newFlags // A, P are zero
- = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
- | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
- | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
- | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
-
- return (newFlags << 16) | newECX;
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x4A, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
}
void istri_4A ( void )
UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
{
- /* signed bytes (also works for unsigned)
- equal each (straightforward parallel compare)
- polarity Masked- (IntRes2 = IntRes1 ^ validL)
- index 0 (want index of ls 1 bit)
- */
- Int i;
- UChar* argL = (UChar*)argLU;
- UChar* argR = (UChar*)argRU;
- UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
- for (i = 15; i >= 0; i--) {
- UChar cL = argL[i];
- UChar cR = argR[i];
- zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
- zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
- boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
- }
- UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
- UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
-
- // do invalidation, common to all equal-each cases
- UInt intRes1
- = (boolResII & validL & validR) // if both valid, use cmpres
- | (~ (validL | validR)); // if both invalid, force 1
- // else force 0
- intRes1 &= 0xFFFF;
-
- // polarity: Masked-
- UInt intRes2 = (intRes1 ^ validL) & 0xFFFF;
-
- // generate ecx value, common to all index-of-ls-1-bit cases
- UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
-
- // generate new flags, common to all ISTRI and ISTRM cases
- UInt newFlags // A, P are zero
- = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
- | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
- | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
- | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
-
- return (newFlags << 16) | newECX;
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x3A, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
}
void istri_3A ( void )
UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
{
- /* unsigned bytes
- equal ordered (substring search)
- polarity + (IntRes2 = IntRes1)
- index 0 (want index of ls 1 bit)
-
- argL: haystack, argR: needle
- */
- UInt i, hi, ni;
- UChar* argL = (UChar*)argLU;
- UChar* argR = (UChar*)argRU;
- UInt boolRes = 0, zmaskL = 0, zmaskR = 0;
- UInt keepSearching = 1;
- for (i = 0; i < 16; i++) {
- UChar cL = argL[i];
- UChar cR = argR[i];
- zmaskL = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0);
- zmaskR = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0);
-
- if (argL[i] == 0) {
- // run off the end of the haystack.
- keepSearching = 0;
- }
-
- UInt m = 1;
- if (keepSearching) {
- for (ni = 0; ni < 16; ni++) {
- if (argR[ni] == 0) break;
- hi = ni + i;
- if (hi >= 16) break;
- if (argL[hi] != argR[ni]) { m = 0; break; }
- }
- } else {
- m = 0;
- }
- boolRes = (boolRes >> 1) | (m << 15);
-
- }
-
- // boolRes is "pre-invalidated"
- UInt intRes1 = boolRes & 0xFFFF;
-
- // polarity: +
- UInt intRes2 = intRes1;
-
- // generate ecx value, common to all index-of-ls-1-bit cases
- UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
-
- // generate new flags, common to all ISTRI and ISTRM cases
- UInt newFlags // A, P are zero
- = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
- | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
- | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
- | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
-
- return (newFlags << 16) | newECX;
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x0C, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
}
void istri_0C ( void )
UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
{
- /* unsigned bytes (also works for unsigned)
- equal each (straightforward parallel compare)
- polarity + (IntRes2 = IntRes1)
- index 0 (want index of ls 1 bit)
- */
- Int i;
- UChar* argL = (UChar*)argLU;
- UChar* argR = (UChar*)argRU;
- UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
- for (i = 15; i >= 0; i--) {
- UChar cL = argL[i];
- UChar cR = argR[i];
- zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
- zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
- boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
- }
- UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
- UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x08, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
+}
- // do invalidation, common to all equal-each cases
- UInt intRes1
- = (boolResII & validL & validR) // if both valid, use cmpres
- | (~ (validL | validR)); // if both invalid, force 1
- // else force 0
- intRes1 &= 0xFFFF;
+void istri_08 ( void )
+{
+ char* wot = "08";
+ UInt(*h)(V128*,V128*) = h_pcmpistri_08;
+ UInt(*s)(V128*,V128*) = s_pcmpistri_08;
- // polarity: +
- UInt intRes2 = intRes1;
+ try_istri(wot,h,s, "0000000000000000", "0000000000000000");
- // generate ecx value, common to all index-of-ls-1-bit cases
- UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
- // generate new flags, common to all ISTRI and ISTRM cases
- UInt newFlags // A, P are zero
- = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
- | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
- | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
- | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
- return (newFlags << 16) | newECX;
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+ try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
+ try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
+
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+ try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
+
+ try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
+ try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
+
+ try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
+
+ try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
+ try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
+ try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
+
+ try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
}
-void istri_08 ( void )
+
+
+//////////////////////////////////////////////////////////
+// //
+// ISTRI_1A //
+// //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_1A ( V128* argL, V128* argR )
{
- char* wot = "08";
- UInt(*h)(V128*,V128*) = h_pcmpistri_08;
- UInt(*s)(V128*,V128*) = s_pcmpistri_08;
+ V128 block[2];
+ memcpy(&block[0], argL, sizeof(V128));
+ memcpy(&block[1], argR, sizeof(V128));
+ ULong res, flags;
+ __asm__ __volatile__(
+ "subq $1024, %%rsp" "\n\t"
+ "movdqu 0(%2), %%xmm2" "\n\t"
+ "movdqu 16(%2), %%xmm11" "\n\t"
+ "pcmpistri $0x1A, %%xmm2, %%xmm11" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%rdx" "\n\t"
+ "movq %%rcx, %0" "\n\t"
+ "movq %%rdx, %1" "\n\t"
+ "addq $1024, %%rsp" "\n\t"
+ : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+ : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+ );
+ return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
+{
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x1A, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
+}
+
+void istri_1A ( void )
+{
+ char* wot = "1A";
+ UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
+ UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+//////////////////////////////////////////////////////////
+// //
+// ISTRI_02 //
+// //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_02 ( V128* argL, V128* argR )
+{
+ V128 block[2];
+ memcpy(&block[0], argL, sizeof(V128));
+ memcpy(&block[1], argR, sizeof(V128));
+ ULong res, flags;
+ __asm__ __volatile__(
+ "subq $1024, %%rsp" "\n\t"
+ "movdqu 0(%2), %%xmm2" "\n\t"
+ "movdqu 16(%2), %%xmm11" "\n\t"
+ "pcmpistri $0x02, %%xmm2, %%xmm11" "\n\t"
+//"pcmpistrm $0x02, %%xmm2, %%xmm11" "\n\t"
+//"movd %%xmm0, %%ecx" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%rdx" "\n\t"
+ "movq %%rcx, %0" "\n\t"
+ "movq %%rdx, %1" "\n\t"
+ "addq $1024, %%rsp" "\n\t"
+ : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+ : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+ );
+ return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
+{
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x02, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
+}
+
+void istri_02 ( void )
+{
+ char* wot = "02";
+ UInt(*h)(V128*,V128*) = h_pcmpistri_02;
+ UInt(*s)(V128*,V128*) = s_pcmpistri_02;
+
+ try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
+ try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
+
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
+
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
+
+ try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
+
+ try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
+
+ try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
+ try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
+}
+
+
+//////////////////////////////////////////////////////////
+// //
+// ISTRI_12 //
+// //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_12 ( V128* argL, V128* argR )
+{
+ V128 block[2];
+ memcpy(&block[0], argL, sizeof(V128));
+ memcpy(&block[1], argR, sizeof(V128));
+ ULong res, flags;
+ __asm__ __volatile__(
+ "subq $1024, %%rsp" "\n\t"
+ "movdqu 0(%2), %%xmm2" "\n\t"
+ "movdqu 16(%2), %%xmm11" "\n\t"
+ "pcmpistri $0x12, %%xmm2, %%xmm11" "\n\t"
+//"pcmpistrm $0x12, %%xmm2, %%xmm11" "\n\t"
+//"movd %%xmm0, %%ecx" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%rdx" "\n\t"
+ "movq %%rcx, %0" "\n\t"
+ "movq %%rdx, %1" "\n\t"
+ "addq $1024, %%rsp" "\n\t"
+ : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+ : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+ );
+ return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
+{
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x12, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
+}
+
+void istri_12 ( void )
+{
+ char* wot = "12";
+ UInt(*h)(V128*,V128*) = h_pcmpistri_12;
+ UInt(*s)(V128*,V128*) = s_pcmpistri_12;
+
+ try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
+ try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
+
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
+
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
+ try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
+
+ try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+ try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
+
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
+ try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
+
+ try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
+
+ try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
+ try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
+}
+
+
+
+//////////////////////////////////////////////////////////
+// //
+// ISTRI_44 //
+// //
+//////////////////////////////////////////////////////////
+
+UInt h_pcmpistri_44 ( V128* argL, V128* argR )
+{
+ V128 block[2];
+ memcpy(&block[0], argL, sizeof(V128));
+ memcpy(&block[1], argR, sizeof(V128));
+ ULong res, flags;
+ __asm__ __volatile__(
+ "subq $1024, %%rsp" "\n\t"
+ "movdqu 0(%2), %%xmm2" "\n\t"
+ "movdqu 16(%2), %%xmm11" "\n\t"
+ "pcmpistri $0x44, %%xmm2, %%xmm11" "\n\t"
+//"pcmpistrm $0x04, %%xmm2, %%xmm11" "\n\t"
+//"movd %%xmm0, %%ecx" "\n\t"
+ "pushfq" "\n\t"
+ "popq %%rdx" "\n\t"
+ "movq %%rcx, %0" "\n\t"
+ "movq %%rdx, %1" "\n\t"
+ "addq $1024, %%rsp" "\n\t"
+ : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
+ : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
+ );
+ return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
+}
+
+UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
+{
+ V128 resV;
+ UInt resOSZACP, resECX;
+ Bool ok
+ = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
+ zmask_from_V128(argLU),
+ zmask_from_V128(argRU),
+ 0x44, False/*!isSTRM*/
+ );
+ assert(ok);
+ resECX = *(UInt*)(&resV[0]);
+ return (resOSZACP << 16) | resECX;
+}
+
+void istri_44 ( void )
+{
+ char* wot = "44";
+ UInt(*h)(V128*,V128*) = h_pcmpistri_44;
+ UInt(*s)(V128*,V128*) = s_pcmpistri_44;
+
+ try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
+ try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
+ try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
+ try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
+
+ try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
+ try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
+ try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
+ try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
+ try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
+
+ try_istri(wot,h,s, "0000000000000000", "0000000000000000");
+
+ try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
+ try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
+ try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
+
+ try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
+ try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
+ try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
+
+ try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
+ try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
+
+ try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
+ try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
+}
+
+
+
//////////////////////////////////////////////////////////
istri_4A();
istri_3A();
istri_08();
+ istri_1A();
+ istri_02();
istri_0C();
+ istri_12();
+ istri_44();
return 0;
}