extern void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st );
extern void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st );
+extern void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st );
extern void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
extern void amd64g_dirtyhelper_SxDT ( void* address,
ULong op /* 0 or 1 */ );
+extern ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State*,
+ HWord, HWord );
+extern ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State*,
+ HWord, HWord );
+extern ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State*,
+ HWord, HWord );
+extern ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State*,
+ HWord, HWord );
+
//extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
//extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* );
//extern void amd64g_dirtyhelper_CPUID_sse2 ( VexGuestAMD64State* );
}
+/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
+ capable.
+
+ vendor_id : GenuineIntel
+ cpu family : 6
+ model : 37
+ model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
+ stepping : 2
+ cpu MHz : 3334.000
+ cache size : 4096 KB
+ physical id : 0
+ siblings : 4
+ core id : 0
+ cpu cores : 2
+ apicid : 0
+ initial apicid : 0
+ fpu : yes
+ fpu_exception : yes
+ cpuid level : 11
+ wp : yes
+ flags : fpu vme de pse tsc msr pae mce cx8 apic sep
+ mtrr pge mca cmov pat pse36 clflush dts acpi
+ mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
+ lm constant_tsc arch_perfmon pebs bts rep_good
+ xtopology nonstop_tsc aperfmperf pni pclmulqdq
+ dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
+ xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
+ arat tpr_shadow vnmi flexpriority ept vpid
+ bogomips : 6957.57
+ clflush size : 64
+ cache_alignment : 64
+ address sizes : 36 bits physical, 48 bits virtual
+ power management:
+*/
+void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
+{
+# define SET_ABCD(_a,_b,_c,_d) \
+ do { st->guest_RAX = (ULong)(_a); \
+ st->guest_RBX = (ULong)(_b); \
+ st->guest_RCX = (ULong)(_c); \
+ st->guest_RDX = (ULong)(_d); \
+ } while (0)
+
+ UInt old_eax = (UInt)st->guest_RAX;
+ UInt old_ecx = (UInt)st->guest_RCX;
+
+ switch (old_eax) {
+ case 0x00000000:
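+ /* EAX = maximum basic leaf (0xb, matching "cpuid level : 11"
+ above); EBX, EDX, ECX = "Genu", "ineI", "ntel", the
+ little-endian pieces of the "GenuineIntel" vendor string. */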
+ SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
+ break;
+ case 0x00000001:
+ SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
+ break;
+ case 0x00000002:
+ SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
+ break;
+ case 0x00000003:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000004:
+ switch (old_ecx) {
+ case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
+ 0x0000003f, 0x00000000); break;
+ case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
+ 0x0000007f, 0x00000000); break;
+ case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
+ 0x000001ff, 0x00000000); break;
+ case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
+ 0x00000fff, 0x00000002); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ 0x00000000, 0x00000000); break;
+ }
+ break;
+ case 0x00000005:
+ SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
+ break;
+ case 0x00000006:
+ SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
+ break;
+ case 0x00000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000008:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000009:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x0000000a:
+ SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
+ break;
+ case 0x0000000b:
+ switch (old_ecx) {
+ case 0x00000000:
+ SET_ABCD(0x00000001, 0x00000002,
+ 0x00000100, 0x00000000); break;
+ case 0x00000001:
+ SET_ABCD(0x00000004, 0x00000004,
+ 0x00000201, 0x00000000); break;
+ default:
+ SET_ABCD(0x00000000, 0x00000000,
+ old_ecx, 0x00000000); break;
+ }
+ break;
+ case 0x0000000c:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
+ break;
+ case 0x0000000d:
+ switch (old_ecx) {
+ case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
+ 0x00000100, 0x00000000); break;
+ case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
+ 0x00000201, 0x00000000); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ old_ecx, 0x00000000); break;
+ }
+ break;
+ case 0x80000000:
+ SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000001:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
+ break;
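+ /* Leaves 0x80000002 .. 0x80000004 return the brand string
+ shown as "model name" above, 16 little-endian bytes per
+ leaf. */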
+ case 0x80000002:
+ SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+ break;
+ case 0x80000003:
+ SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
+ break;
+ case 0x80000004:
+ SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
+ break;
+ case 0x80000005:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000006:
+ SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
+ break;
+ case 0x80000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
+ break;
+ case 0x80000008:
+ SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ default:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
+ break;
+ }
+# undef SET_ABCD
+}
+
+
ULong amd64g_calculate_RCR ( ULong arg,
ULong rot_amt,
ULong rflags_in,
}
+/*---------------------------------------------------------------*/
+/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE: DIRTY HELPER(s). (Not really dirty:
+ each could be a clean helper, were it not for the fact that we
+ can't pass 2 x V128 by value to a clean helper.) Reads guest
+ state, writes nothing to guest state, accesses no memory: a pure
+ function.
+ This relies on the property that the XMM regs are laid out
+ consecutively in the guest state, so we can index into them here.
+ The returned index value (0 .. 16) is in the low 16 bits of the
+ return value; bits 31:16 hold the resulting OSZACP value.
+*/
+ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State* gst,
+ HWord gstOffL, HWord gstOffR )
+{
+ U128* argL = (U128*)( ((UChar*)gst) + gstOffL );
+ U128* argR = (U128*)( ((UChar*)gst) + gstOffR );
+ return (HWord) compute_ISTRI_08( argL, argR );
+}
+
+ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State* gst,
+ HWord gstOffL, HWord gstOffR )
+{
+ U128* argL = (U128*)( ((UChar*)gst) + gstOffL );
+ U128* argR = (U128*)( ((UChar*)gst) + gstOffR );
+ return (HWord) compute_ISTRI_0C( argL, argR );
+}
+
+ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State* gst,
+ HWord gstOffL, HWord gstOffR )
+{
+ U128* argL = (U128*)( ((UChar*)gst) + gstOffL );
+ U128* argR = (U128*)( ((UChar*)gst) + gstOffR );
+ return (HWord) compute_ISTRI_3A( argL, argR );
+}
+
+ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State* gst,
+ HWord gstOffL, HWord gstOffR )
+{
+ U128* argL = (U128*)( ((UChar*)gst) + gstOffL );
+ U128* argR = (U128*)( ((UChar*)gst) + gstOffR );
+ return (HWord) compute_ISTRI_4A( argL, argR );
+}
+
+
/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing, ---*/
/*--- guest state as a whole. ---*/
SSEZERO(vex_state->guest_XMM13);
SSEZERO(vex_state->guest_XMM14);
SSEZERO(vex_state->guest_XMM15);
+ SSEZERO(vex_state->guest_XMM16);
# undef SSEZERO
#define OFFB_XMM13 offsetof(VexGuestAMD64State,guest_XMM13)
#define OFFB_XMM14 offsetof(VexGuestAMD64State,guest_XMM14)
#define OFFB_XMM15 offsetof(VexGuestAMD64State,guest_XMM15)
+#define OFFB_XMM16 offsetof(VexGuestAMD64State,guest_XMM16)
#define OFFB_EMWARN offsetof(VexGuestAMD64State,guest_EMWARN)
#define OFFB_TISTART offsetof(VexGuestAMD64State,guest_TISTART)
goto decode_success;
}
+ /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
+ (selected special cases that actually occur in glibc,
+ not by any means a complete implementation.)
+ */
+ if (have66noF2noF3(pfx)
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A
+ && insn[2] == 0x63) {
+
+ UInt regNoL = 0;
+ UInt regNoR = 0;
+ UChar imm = 0;
+
+ /* This is a nasty kludge. Ideally we would pass the two V128
+ operands to a clean helper, but we can't do that. So instead,
+ use a dirty helper to compute the results directly from the
+ XMM regs in the guest state. That means, for the memory case,
+ we need to move the left operand into a pseudo-register
+ (XMM16, let's call it). */
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ regNoL = eregOfRexRM(pfx, modrm);
+ regNoR = gregOfRexRM(pfx, modrm);
+ imm = insn[3+1];
+ delta += 3+1+1;
+ } else {
+ regNoL = 16; /* use XMM16 as an intermediary */
+ regNoR = gregOfRexRM(pfx, modrm);
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
+ imm = insn[3+alen];
+ delta += 3+alen+1;
+ }
+
+ /* Now we know the XMM reg numbers for the operands, and the
+ immediate byte. Is it one we can actually handle? */
+ void* fn = NULL;
+ HChar* nm = NULL;
+ switch (imm) {
+ case 0x08: fn = &amd64g_dirtyhelper_ISTRI_08;
+ nm = "amd64g_dirtyhelper_ISTRI_08"; break;
+ case 0x0C: fn = &amd64g_dirtyhelper_ISTRI_0C;
+ nm = "amd64g_dirtyhelper_ISTRI_0C"; break;
+ case 0x3A: fn = &amd64g_dirtyhelper_ISTRI_3A;
+ nm = "amd64g_dirtyhelper_ISTRI_3A"; break;
+ case 0x4A: fn = &amd64g_dirtyhelper_ISTRI_4A;
+ nm = "amd64g_dirtyhelper_ISTRI_4A"; break;
+ default: goto decode_failure;
+ }
+ vassert(fn); vassert(nm);
+
+ UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
+ UInt gstOffR = xmmGuestRegOffset(regNoR);
+
+ IRTemp resT = newTemp(Ity_I64);
+ IRDirty* d
+ = unsafeIRDirty_1_N( resT, 0/*regparms*/,
+ nm, fn,
+ mkIRExprVec_2( mkIRExpr_HWord(gstOffL),
+ mkIRExpr_HWord(gstOffR)) );
+ /* It's not really a dirty call, but we can't use the clean
+ helper mechanism here for the very lame reason that we can't
+ pass 2 x V128s by value to a helper. Hence this roundabout
+ scheme. */
+ d->needsBBP = True;
+ d->nFxState = 2;
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = gstOffL;
+ d->fxState[0].size = sizeof(U128);
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = gstOffR;
+ d->fxState[1].size = sizeof(U128);
+ stmt( IRStmt_Dirty(d) );
+
+ /* Now resT[15:0] holds what the Intel docs call IntRes2, and
+ resT[31:16] holds the new OSZACP values. We must park the
+ result in ECX and update the condition codes. */
+ putIReg64(R_RCX, binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF)));
+
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And64, binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
+ mkU64(0xFFFF))
+ ));
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ if (regNoL == 16) {
+ DIP("pcmpistri $%x,%s,%s\n",
+ (UInt)imm, dis_buf, nameXMMReg(regNoR));
+ } else {
+ DIP("pcmpistri $%x,%s,%s\n",
+ (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
+ }
+
+ goto decode_success;
+ }
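+ /* To illustrate (a sketch only; the register numbers are made
+ up): "pcmpistri $0x3a,%xmm5,%xmm0" becomes a single dirty
+ call plus the ECX/flags fixup, roughly
+
+ t = ISTRI_3A(xmmGuestRegOffset(5), xmmGuestRegOffset(0))
+ RCX = t & 0xFFFF
+ CC_DEP1 = (t >> 16) & 0xFFFF, CC_OP = COPY
+ */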
/* ---------------------------------------------------- */
/* --- end of the SSE4 decoder --- */
fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16";
fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16;
/* This is a Core-2-like machine */
+ /* fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16"; */
+ /* fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16; */
+ /* This is a Core-i5-like machine */
}
else {
- /* Give a CPUID for at least a baseline machine, no SSE2
- and no CX16 */
+ /* Give a CPUID for at least a baseline machine, SSE2
+ only, and no CX16 */
fName = "amd64g_dirtyhelper_CPUID_baseline";
fAddr = &amd64g_dirtyhelper_CPUID_baseline;
}
}
+
+/*---------------------------------------------------------*/
+/*--- SSE4.2 PCMP{E,I}STR{I,M} helpers ---*/
+/*---------------------------------------------------------*/
+
+/* We need the definitions for OSZACP eflags/rflags offsets.
+ #including guest_{amd64,x86}_defs.h causes chaos, so just copy the
+ required values directly. They are not going to change in the
+ future :-)
+*/
+#define SHIFT_O 11
+#define SHIFT_S 7
+#define SHIFT_Z 6
+#define SHIFT_A 4
+#define SHIFT_C 0
+#define SHIFT_P 2
+
+#define MASK_O (1 << SHIFT_O)
+#define MASK_S (1 << SHIFT_S)
+#define MASK_Z (1 << SHIFT_Z)
+#define MASK_A (1 << SHIFT_A)
+#define MASK_C (1 << SHIFT_C)
+#define MASK_P (1 << SHIFT_P)
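+
+/* If paranoid, these could be cross-checked against the
+ AMD64G_CC_SHIFT_ and AMD64G_CC_MASK_ constants in
+ guest_amd64_defs.h, for example (a sketch, assuming that header
+ were includable here):
+
+ vassert(MASK_O == AMD64G_CC_MASK_O && MASK_C == AMD64G_CC_MASK_C);
+*/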
+
+
+/* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
+ Delight. */
+static UInt clz32 ( UInt x )
+{
+ Int y, m, n;
+ y = -(x >> 16);
+ m = (y >> 16) & 16;
+ n = 16 - m;
+ x = x >> m;
+ y = x - 0x100;
+ m = (y >> 16) & 8;
+ n = n + m;
+ x = x << m;
+ y = x - 0x1000;
+ m = (y >> 16) & 4;
+ n = n + m;
+ x = x << m;
+ y = x - 0x4000;
+ m = (y >> 16) & 2;
+ n = n + m;
+ x = x << m;
+ y = x >> 14;
+ m = y & ~(y >> 1);
+ return n + 2 - m;
+}
+
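+/* Count trailing zeroes, w/ 0-produces-32 semantics. (~x) & (x-1)
+ has ones at exactly the bit positions below the lowest set bit of
+ x, and is all-ones when x == 0, so its leading-zero count is 32
+ minus the trailing-zero count of x. E.g. ctz32(8) == 3 and
+ ctz32(0) == 32. */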
+static UInt ctz32 ( UInt x )
+{
+ return 32 - clz32((~x) & (x-1));
+}
+
+
+/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from
+ generated code. Pure function: reads *argLU and *argRU; the
+ returned index value (0 .. 16) is in the low 16 bits of the
+ return value, and bits 31:16 hold the resulting OSZACP value.
+*/
+UInt compute_ISTRI_08 ( U128* argLU, U128* argRU )
+{
+ /* unsigned bytes (also works for signed)
+ equal each (straightforward parallel compare)
+ polarity + (IntRes2 = IntRes1)
+ index 0 (want index of ls 1 bit)
+ */
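+ /* Example: argL = "aaa", argR = "aab", NUL-padded to 16 bytes.
+ validL = validR = 0x0007; bytes 0 and 1 agree, byte 2 differs,
+ and bytes 3..15 are forced to 1 as both-invalid; hence
+ intRes2 = 0xFFFB, ECX = ctz32(0xFFFB) = 0, and C, S, Z are
+ set, with O = IntRes2[0] = 1. */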
+ Int i;
+ UChar* argL = (UChar*)argLU;
+ UChar* argR = (UChar*)argRU;
+ UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+ for (i = 15; i >= 0; i--) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
+ zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
+ boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+ }
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
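+ /* validL/validR have a 1 at exactly the byte positions before
+ the first NUL of argL/argR, i.e. the bytes that are part of
+ the string proper: zmask 0x0010 (NUL at byte 4) gives valid
+ 0x000F. */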
+
+ // do invalidation, common to all equal-each cases
+ UInt intRes1
+ = (boolResII & validL & validR) // if both valid, use cmpres
+ | (~ (validL | validR)); // if both invalid, force 1
+ // else force 0
+ intRes1 &= 0xFFFF;
+
+ // polarity: +
+ UInt intRes2 = intRes1;
+
+ // generate ecx value, common to all index-of-ls-1-bit cases
+ UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ UInt newFlags // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+
+ return (newFlags << 16) | newECX;
+}
+
+
+UInt compute_ISTRI_0C ( U128* argLU, U128* argRU )
+{
+ /* unsigned bytes
+ equal ordered (substring search)
+ polarity + (IntRes2 = IntRes1)
+ index 0 (want index of ls 1 bit)
+
+ argL: haystack, argR: needle
+ */
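+ /* Example: argL (haystack) = "hello world", argR (needle) =
+ "world", both NUL-padded. The needle matches at offset 6
+ only, so boolRes = intRes2 = 0x0040 and ECX = 6, with C, S
+ and Z all set and O = 0. */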
+ UInt i, hi, ni;
+ UChar* argL = (UChar*)argLU;
+ UChar* argR = (UChar*)argRU;
+ UInt boolRes = 0, zmaskL = 0, zmaskR = 0;
+ UInt keepSearching = 1;
+ for (i = 0; i < 16; i++) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ zmaskL = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0);
+ zmaskR = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0);
+
+ if (argL[i] == 0) {
+ // run off the end of the haystack.
+ keepSearching = 0;
+ }
+
+ UInt m = 1;
+ if (keepSearching) {
+ for (ni = 0; ni < 16; ni++) {
+ if (argR[ni] == 0) break;
+ hi = ni + i;
+ if (hi >= 16) break;
+ if (argL[hi] != argR[ni]) { m = 0; break; }
+ }
+ } else {
+ m = 0;
+ }
+ boolRes = (boolRes >> 1) | (m << 15);
+ }
+
+ // boolRes is "pre-invalidated": the search loop above never
+ // reports a match starting at or beyond the haystack NUL, so no
+ // separate invalidation step is needed
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // polarity: +
+ UInt intRes2 = intRes1;
+
+ // generate ecx value, common to all index-of-ls-1-bit cases
+ UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ UInt newFlags // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+
+ return (newFlags << 16) | newECX;
+}
+
+
+UInt compute_ISTRI_3A ( U128* argLU, U128* argRU )
+{
+ /* signed bytes (also works for unsigned)
+ equal each (straightforward parallel compare)
+ polarity Masked- (IntRes2 = IntRes1 ^ validL)
+ index 0 (want index of ls 1 bit)
+ */
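+ /* Example: argL = "aaa", argR = "aab", NUL-padded. intRes1 =
+ 0xFFFB, exactly as in the 08 case; the Masked- polarity then
+ gives intRes2 = (0xFFFB ^ 0x0007) & 0xFFFF = 0xFFFC, so
+ ECX = 2, the index of the first difference. */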
+ Int i;
+ UChar* argL = (UChar*)argLU;
+ UChar* argR = (UChar*)argRU;
+ UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+ for (i = 15; i >= 0; i--) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
+ zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
+ boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+ }
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+
+ // do invalidation, common to all equal-each cases
+ UInt intRes1
+ = (boolResII & validL & validR) // if both valid, use cmpres
+ | (~ (validL | validR)); // if both invalid, force 1
+ // else force 0
+ intRes1 &= 0xFFFF;
+
+ // polarity: Masked-
+ UInt intRes2 = (intRes1 ^ validL) & 0xFFFF;
+
+ // generate ecx value, common to all index-of-ls-1-bit cases
+ UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ UInt newFlags // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+
+ return (newFlags << 16) | newECX;
+}
+
+
+UInt compute_ISTRI_4A ( U128* argLU, U128* argRU )
+{
+ /* signed bytes (also works for unsigned)
+ equal each (straightforward parallel compare)
+ polarity + (IntRes2 = IntRes1)
+ index 1 (want index of ms 1 bit)
+ */
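+ /* Example: with no NUL bytes anywhere, validL = validR = 0xFFFF
+ and intRes2 is simply the byte-equality mask; for argL =
+ "aaaabbbbccccdddd" vs argR = "aaaaxxxxccccxxxx", intRes2 =
+ 0x0F0F and ECX = 31 - clz32(0x0F0F) = 11, the index of the
+ last equal byte. */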
+ Int i;
+ UChar* argL = (UChar*)argLU;
+ UChar* argR = (UChar*)argRU;
+ UInt boolResII = 0, zmaskL = 0, zmaskR = 0;
+ for (i = 15; i >= 0; i--) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0);
+ zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0);
+ boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+ }
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+
+ // do invalidation, common to all equal-each cases
+ UInt intRes1
+ = (boolResII & validL & validR) // if both valid, use cmpres
+ | (~ (validL | validR)); // if both invalid, force 1
+ // else force 0
+ intRes1 &= 0xFFFF;
+
+ // polarity: +
+ UInt intRes2 = intRes1;
+
+ // generate ecx value, common to all index-of-ms-1-bit cases
+ UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ UInt newFlags // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+
+ return (newFlags << 16) | newECX;
+}
+
+
/*---------------------------------------------------------------*/
/*--- end guest_generic_x87.c ---*/
/*---------------------------------------------------------------*/
#define FP_REG(ii) (10*(7-(ii)))
-/* Do the computations for x86/amd64 FXTRACT */
+/* Do the computations for x86/amd64 FXTRACT. Called directly from
+ generated code. CLEAN HELPER. */
extern ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp );
+/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from
+ generated code. Pure function: reads *argLU and *argRU; the
+ returned index value (0 .. 16) is in the low 16 bits of the
+ return value, and bits 31:16 hold the resulting OSZACP value. */
+extern UInt compute_ISTRI_08 ( U128* argLU, U128* argRU );
+extern UInt compute_ISTRI_0C ( U128* argLU, U128* argRU );
+extern UInt compute_ISTRI_3A ( U128* argLU, U128* argRU );
+extern UInt compute_ISTRI_4A ( U128* argLU, U128* argRU );
#endif /* ndef __VEX_GUEST_GENERIC_X87_H */
associated with a %fs value of zero. */
/* 184 */ ULong guest_FS_ZERO;
- /* XMM registers */
+ /* XMM registers. Note that these must be allocated
+ consecutively in order that the SSE4.2 PCMP{E,I}STR{I,M}
+ helpers can treat them as an array. XMM16 is a fake register,
+ used as an intermediary when handling the aforementioned
+ insns. */
/* 192 */ULong guest_SSEROUND;
/* 200 */U128 guest_XMM0;
U128 guest_XMM1;
U128 guest_XMM13;
U128 guest_XMM14;
U128 guest_XMM15;
+ U128 guest_XMM16;
/* FPU */
/* Note. Setting guest_FTOP to be ULong messes up the