From: Julian Seward Date: Tue, 17 Aug 2010 00:19:25 +0000 (+0000) Subject: Majorly improve and generalise the core arithmetic routines. X-Git-Tag: svn/VALGRIND_3_6_0~193 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c529c2c43e2f9408c6fa666a1f94f11e168675ff;p=thirdparty%2Fvalgrind.git Majorly improve and generalise the core arithmetic routines. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11262 --- diff --git a/none/tests/amd64/pcmpstr64.c b/none/tests/amd64/pcmpstr64.c index b3defd2c47..69e94e805d 100644 --- a/none/tests/amd64/pcmpstr64.c +++ b/none/tests/amd64/pcmpstr64.c @@ -8,6 +8,9 @@ typedef unsigned int UInt; typedef signed int Int; typedef unsigned char UChar; typedef unsigned long long int ULong; +typedef UChar Bool; +#define False ((Bool)0) +#define True ((Bool)1) #define SHIFT_O 11 #define SHIFT_S 7 @@ -87,6 +90,268 @@ void try_istri ( char* which, which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!"); } +UInt zmask_from_V128 ( V128* arg ) +{ + UInt i, res = 0; + for (i = 0; i < 16; i++) { + res |= (((*arg)[i] == 0) ? 1 : 0) << i; + } + return res; +} + +////////////////////////////////////////////////////////// +// // +// GENERAL // +// // +////////////////////////////////////////////////////////// + + +/* Given partial results from a pcmpXstrX operation (intRes1, + basically), generate an I format (index value for ECX) output, and + also the new OSZACP flags. 
+*/ +static +void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + UInt intRes1, + UInt zmaskL, UInt zmaskR, + UInt validL, + UInt pol, UInt idx ) +{ + assert((pol >> 2) == 0); + assert((idx >> 1) == 0); + + UInt intRes2 = 0; + switch (pol) { + case 0: intRes2 = intRes1; break; // pol + + case 1: intRes2 = ~intRes1; break; // pol - + case 2: intRes2 = intRes1; break; // pol m+ + case 3: intRes2 = intRes1 ^ validL; break; // pol m- + } + intRes2 &= 0xFFFF; + + // generate ecx value + UInt newECX = 0; + if (idx) { + // index of ms-1-bit + newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); + } else { + // index of ls-1-bit + newECX = intRes2 == 0 ? 16 : ctz32(intRes2); + } + + *(UInt*)(&resV[0]) = newECX; + + // generate new flags, common to all ISTRI and ISTRM cases + *resOSZACP // A, P are zero + = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 + | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 + | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 + | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] +} + + +/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} + variants. + + For xSTRI variants, the new ECX value is placed in the 32 bits + pointed to by *resV. For xSTRM variants, the result is a 128 bit + value and is placed at *resV in the obvious way. + + For all variants, the new OSZACP value is placed at *resOSZACP. + + argLV and argRV are the vector args. The caller must prepare a + 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this + must be 1 for each zero byte of the respective arg. For ESTRx + variants this is derived from the explicit length indication, and + must be 0 in all places except at the bit index corresponding to + the valid length (0 .. 16). If the valid length is 16 then the + mask must be all zeroes. In all cases, bits 31:16 must be zero. + + imm8 is the original immediate from the instruction. 
isSTRM + indicates whether this is a xSTRM or xSTRI variant, which controls + how much of *resV is written. + + If the given imm8 case can be handled, the return value is True. + If not, False is returned, and neither *resV nor *resOSZACP is + altered. +*/ + +Bool pcmpXstrX_WRK ( /*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + V128* argLV, V128* argRV, + UInt zmaskL, UInt zmaskR, + UInt imm8, Bool isSTRM ) +{ + assert(imm8 < 0x80); + assert((zmaskL >> 16) == 0); + assert((zmaskR >> 16) == 0); + + UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format + UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn + UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity + UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask + + /*----------------------------------------*/ + /*-- strcmp on byte data --*/ + /*----------------------------------------*/ + + if (agg == 2/*equal each, aka strcmp*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/) + && !isSTRM) { + Int i; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolResII = 0; + for (i = 15; i >= 0; i--) { + UChar cL = argL[i]; + UChar cR = argR[i]; + boolResII = (boolResII << 1) | (cL == cR ? 
1 : 0); + } + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + // do invalidation, common to all equal-each cases + UInt intRes1 + = (boolResII & validL & validR) // if both valid, use cmpres + | (~ (validL | validR)); // if both invalid, force 1 + // else force 0 + intRes1 &= 0xFFFF; + + // generate I-format output + pcmpXstrX_WRK_gen_output_fmt_I( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx + ); + + return True; + } + + /*----------------------------------------*/ + /*-- set membership on byte data --*/ + /*----------------------------------------*/ + + if (agg == 0/*equal any, aka find chars in a set*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/) + && !isSTRM) { + /* argL: the string, argR: charset */ + UInt si, ci; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string. 
+ break; + UInt m = 0; + for (ci = 0; ci < 16; ci++) { + if ((validR & (1 << ci)) == 0) break; + if (argR[ci] == argL[si]) { m = 1; break; } + } + boolRes |= (m << si); + } + + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; + + // generate I-format output + pcmpXstrX_WRK_gen_output_fmt_I( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx + ); + + return True; + } + + /*----------------------------------------*/ + /*-- substring search on byte data --*/ + /*----------------------------------------*/ + + if (agg == 3/*equal ordered, aka substring search*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/) + && !isSTRM) { + + /* argL: haystack, argR: needle */ + UInt ni, hi; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (hi = 0; hi < 16; hi++) { + if ((validL & (1 << hi)) == 0) + // run off the end of the haystack + break; + UInt m = 1; + for (ni = 0; ni < 16; ni++) { + if ((validR & (1 << ni)) == 0) break; + UInt i = ni + hi; + if (i >= 16) break; + if (argL[i] != argR[ni]) { m = 0; break; } + } + boolRes |= (m << hi); + } + + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; + + // generate I-format output + pcmpXstrX_WRK_gen_output_fmt_I( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx + ); + + return True; + } + + /*----------------------------------------*/ + /*-- ranges, unsigned byte data --*/ + /*----------------------------------------*/ + + if (agg == 1/*ranges*/ + && fmt == 0/*ub*/ + && !isSTRM) { + + /* argL: string, argR: range-pairs */ + UInt ri, si; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string + 
break; + UInt m = 0; + for (ri = 0; ri < 16; ri += 2) { + if ((validR & (3 << ri)) != (3 << ri)) break; + if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { + m = 1; break; + } + } + boolRes |= (m << si); + } + + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; + + // generate I-format output + pcmpXstrX_WRK_gen_output_fmt_I( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx + ); + + return True; + } + + return False; +} + + ////////////////////////////////////////////////////////// // // // ISTRI_4A // @@ -117,46 +382,17 @@ UInt h_pcmpistri_4A ( V128* argL, V128* argR ) UInt s_pcmpistri_4A ( V128* argLU, V128* argRU ) { - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 1 (want index of ms 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); - } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; - - // polarity: + - UInt intRes2 = intRes1; - - // generate ecx value, common to all index-of-ms-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); - - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 
0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] - - return (newFlags << 16) | newECX; + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x4A, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; } void istri_4A ( void ) @@ -237,46 +473,17 @@ UInt h_pcmpistri_3A ( V128* argL, V128* argR ) UInt s_pcmpistri_3A ( V128* argLU, V128* argRU ) { - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity Masked- (IntRes2 = IntRes1 ^ validL) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); - } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; - - // polarity: Masked- - UInt intRes2 = (intRes1 ^ validL) & 0xFFFF; - - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); - - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 
0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] - - return (newFlags << 16) | newECX; + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x3A, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; } void istri_3A ( void ) @@ -360,61 +567,17 @@ UInt h_pcmpistri_0C ( V128* argL, V128* argR ) UInt s_pcmpistri_0C ( V128* argLU, V128* argRU ) { - /* unsigned bytes - equal ordered (substring search) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) - - argL: haystack, argR: needle - */ - UInt i, hi, ni; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolRes = 0, zmaskL = 0, zmaskR = 0; - UInt keepSearching = 1; - for (i = 0; i < 16; i++) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0); - zmaskR = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0); - - if (argL[i] == 0) { - // run off the end of the haystack. - keepSearching = 0; - } - - UInt m = 1; - if (keepSearching) { - for (ni = 0; ni < 16; ni++) { - if (argR[ni] == 0) break; - hi = ni + i; - if (hi >= 16) break; - if (argL[hi] != argR[ni]) { m = 0; break; } - } - } else { - m = 0; - } - boolRes = (boolRes >> 1) | (m << 15); - - } - - // boolRes is "pre-invalidated" - UInt intRes1 = boolRes & 0xFFFF; - - // polarity: + - UInt intRes2 = intRes1; - - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); - - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 
0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] - - return (newFlags << 16) | newECX; + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x0C, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; } void istri_0C ( void ) @@ -486,53 +649,117 @@ UInt h_pcmpistri_08 ( V128* argL, V128* argR ) UInt s_pcmpistri_08 ( V128* argLU, V128* argRU ) { - /* unsigned bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); - } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x08, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; +} - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; +void istri_08 ( void ) +{ + char* wot = "08"; + UInt(*h)(V128*,V128*) = h_pcmpistri_08; + UInt(*s)(V128*,V128*) = s_pcmpistri_08; - // polarity: + - UInt intRes2 = intRes1; + try_istri(wot,h,s, "0000000000000000", "0000000000000000"); - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 
16 : ctz32(intRes2); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); - return (newFlags << 16) | newECX; + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + + try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); + try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); + + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + + try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); + try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); + + try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); + try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); + + try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); + + try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); + try_istri(wot,h,s, "8000000000000000", 
"aaaaaaaa0aaaaaaa"); + try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); + + try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); } -void istri_08 ( void ) + + +////////////////////////////////////////////////////////// +// // +// ISTRI_1A // +// // +////////////////////////////////////////////////////////// + +UInt h_pcmpistri_1A ( V128* argL, V128* argR ) { - char* wot = "08"; - UInt(*h)(V128*,V128*) = h_pcmpistri_08; - UInt(*s)(V128*,V128*) = s_pcmpistri_08; + V128 block[2]; + memcpy(&block[0], argL, sizeof(V128)); + memcpy(&block[1], argR, sizeof(V128)); + ULong res, flags; + __asm__ __volatile__( + "subq $1024, %%rsp" "\n\t" + "movdqu 0(%2), %%xmm2" "\n\t" + "movdqu 16(%2), %%xmm11" "\n\t" + "pcmpistri $0x1A, %%xmm2, %%xmm11" "\n\t" + "pushfq" "\n\t" + "popq %%rdx" "\n\t" + "movq %%rcx, %0" "\n\t" + "movq %%rdx, %1" "\n\t" + "addq $1024, %%rsp" "\n\t" + : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0]) + : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory" + ); + return ((flags & 0x8D5) << 16) | (res & 0xFFFF); +} + +UInt s_pcmpistri_1A ( V128* argLU, V128* argRU ) +{ + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x1A, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; +} + +void istri_1A ( void ) +{ + char* wot = "1A"; + UInt(*h)(V128*,V128*) = h_pcmpistri_1A; + UInt(*s)(V128*,V128*) = s_pcmpistri_1A; try_istri(wot,h,s, "0000000000000000", "0000000000000000"); @@ -578,6 +805,252 @@ void istri_08 ( void ) +////////////////////////////////////////////////////////// +// // +// ISTRI_02 // +// // +////////////////////////////////////////////////////////// + +UInt h_pcmpistri_02 ( V128* argL, V128* argR ) +{ + V128 block[2]; + memcpy(&block[0], argL, sizeof(V128)); + memcpy(&block[1], argR, sizeof(V128)); + ULong res, flags; 
+ __asm__ __volatile__( + "subq $1024, %%rsp" "\n\t" + "movdqu 0(%2), %%xmm2" "\n\t" + "movdqu 16(%2), %%xmm11" "\n\t" + "pcmpistri $0x02, %%xmm2, %%xmm11" "\n\t" +//"pcmpistrm $0x02, %%xmm2, %%xmm11" "\n\t" +//"movd %%xmm0, %%ecx" "\n\t" + "pushfq" "\n\t" + "popq %%rdx" "\n\t" + "movq %%rcx, %0" "\n\t" + "movq %%rdx, %1" "\n\t" + "addq $1024, %%rsp" "\n\t" + : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0]) + : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory" + ); + return ((flags & 0x8D5) << 16) | (res & 0xFFFF); +} + +UInt s_pcmpistri_02 ( V128* argLU, V128* argRU ) +{ + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x02, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; +} + +void istri_02 ( void ) +{ + char* wot = "02"; + UInt(*h)(V128*,V128*) = h_pcmpistri_02; + UInt(*s)(V128*,V128*) = s_pcmpistri_02; + + try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab"); + try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd"); + + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd"); + + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0"); + + try_istri(wot,h,s, "0000000000000000", "0000000000000000"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd"); + 
try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba"); + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb"); + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba"); + + try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0"); + + try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe"); + try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe"); +} + + +////////////////////////////////////////////////////////// +// // +// ISTRI_12 // +// // +////////////////////////////////////////////////////////// + +UInt h_pcmpistri_12 ( V128* argL, V128* argR ) +{ + V128 block[2]; + memcpy(&block[0], argL, sizeof(V128)); + memcpy(&block[1], argR, sizeof(V128)); + ULong res, flags; + __asm__ __volatile__( + "subq $1024, %%rsp" "\n\t" + "movdqu 0(%2), %%xmm2" "\n\t" + "movdqu 16(%2), %%xmm11" "\n\t" + "pcmpistri $0x12, %%xmm2, %%xmm11" "\n\t" +//"pcmpistrm $0x12, %%xmm2, %%xmm11" "\n\t" +//"movd %%xmm0, %%ecx" "\n\t" + "pushfq" "\n\t" + "popq %%rdx" "\n\t" + "movq %%rcx, %0" "\n\t" + "movq %%rdx, %1" "\n\t" + "addq $1024, %%rsp" "\n\t" + : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0]) + : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory" + ); + return ((flags & 0x8D5) << 16) | (res & 0xFFFF); +} + +UInt s_pcmpistri_12 ( V128* argLU, V128* argRU ) +{ + V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x12, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; +} + +void istri_12 ( void ) +{ + char* wot = "12"; + UInt(*h)(V128*,V128*) = h_pcmpistri_12; + UInt(*s)(V128*,V128*) = s_pcmpistri_12; + + try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab"); + try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd"); + + try_istri(wot,h,s, "abcdabcdabcdabcd", 
"000000000000abcd"); + try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd"); + + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d"); + try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0"); + + try_istri(wot,h,s, "0000000000000000", "0000000000000000"); + try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); + + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd"); + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba"); + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb"); + try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba"); + + try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0"); + + try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe"); + try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe"); +} + + + +////////////////////////////////////////////////////////// +// // +// ISTRI_44 // +// // +////////////////////////////////////////////////////////// + +UInt h_pcmpistri_44 ( V128* argL, V128* argR ) +{ + V128 block[2]; + memcpy(&block[0], argL, sizeof(V128)); + memcpy(&block[1], argR, sizeof(V128)); + ULong res, flags; + __asm__ __volatile__( + "subq $1024, %%rsp" "\n\t" + "movdqu 0(%2), %%xmm2" "\n\t" + "movdqu 16(%2), %%xmm11" "\n\t" + "pcmpistri $0x44, %%xmm2, %%xmm11" "\n\t" +//"pcmpistrm $0x04, %%xmm2, %%xmm11" "\n\t" +//"movd %%xmm0, %%ecx" "\n\t" + "pushfq" "\n\t" + "popq %%rdx" "\n\t" + "movq %%rcx, %0" "\n\t" + "movq %%rdx, %1" "\n\t" + "addq $1024, %%rsp" "\n\t" + : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0]) + : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory" + ); + return ((flags & 0x8D5) << 16) | (res & 0xFFFF); +} + +UInt s_pcmpistri_44 ( V128* argLU, V128* argRU ) +{ 
+ V128 resV; + UInt resOSZACP, resECX; + Bool ok + = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU, + zmask_from_V128(argLU), + zmask_from_V128(argRU), + 0x44, False/*!isSTRM*/ + ); + assert(ok); + resECX = *(UInt*)(&resV[0]); + return (resOSZACP << 16) | resECX; +} + +void istri_44 ( void ) +{ + char* wot = "44"; + UInt(*h)(V128*,V128*) = h_pcmpistri_44; + UInt(*s)(V128*,V128*) = s_pcmpistri_44; + + try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc"); + try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb"); + try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb"); + try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb"); + + try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb"); + try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb"); + try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb"); + try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb"); + try_istri(wot,h,s, "0000000000000000", "00000000000000cb"); + + try_istri(wot,h,s, "0000000000000000", "0000000000000000"); + + try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb"); + try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b"); + try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb"); + + try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb"); + try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb"); + try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b"); + + try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421"); + try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421"); + + try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532"); + try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532"); +} + + + ////////////////////////////////////////////////////////// @@ -591,6 +1064,10 @@ int main ( void ) istri_4A(); istri_3A(); istri_08(); + istri_1A(); + istri_02(); istri_0C(); + istri_12(); + istri_44(); return 0; }