From: Julian Seward Date: Mon, 21 May 2012 10:18:49 +0000 (+0000) Subject: Add initial support for Intel AVX instructions (VEX side). X-Git-Tag: svn/VALGRIND_3_8_1^2~148 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=92cc69c4fc2a86a5ef9d09a9296e747839623fdb;p=thirdparty%2Fvalgrind.git Add initial support for Intel AVX instructions (VEX side). Tracker bug is #273475. git-svn-id: svn://svn.valgrind.org/vex/trunk@2330 --- diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index c749c397b8..10f43995d6 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -1723,22 +1723,22 @@ void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr ) _dst[2] = _src[2]; _dst[3] = _src[3]; } \ while (0) - COPY_U128( xmm[0], gst->guest_XMM0 ); - COPY_U128( xmm[1], gst->guest_XMM1 ); - COPY_U128( xmm[2], gst->guest_XMM2 ); - COPY_U128( xmm[3], gst->guest_XMM3 ); - COPY_U128( xmm[4], gst->guest_XMM4 ); - COPY_U128( xmm[5], gst->guest_XMM5 ); - COPY_U128( xmm[6], gst->guest_XMM6 ); - COPY_U128( xmm[7], gst->guest_XMM7 ); - COPY_U128( xmm[8], gst->guest_XMM8 ); - COPY_U128( xmm[9], gst->guest_XMM9 ); - COPY_U128( xmm[10], gst->guest_XMM10 ); - COPY_U128( xmm[11], gst->guest_XMM11 ); - COPY_U128( xmm[12], gst->guest_XMM12 ); - COPY_U128( xmm[13], gst->guest_XMM13 ); - COPY_U128( xmm[14], gst->guest_XMM14 ); - COPY_U128( xmm[15], gst->guest_XMM15 ); + COPY_U128( xmm[0], gst->guest_YMM0 ); + COPY_U128( xmm[1], gst->guest_YMM1 ); + COPY_U128( xmm[2], gst->guest_YMM2 ); + COPY_U128( xmm[3], gst->guest_YMM3 ); + COPY_U128( xmm[4], gst->guest_YMM4 ); + COPY_U128( xmm[5], gst->guest_YMM5 ); + COPY_U128( xmm[6], gst->guest_YMM6 ); + COPY_U128( xmm[7], gst->guest_YMM7 ); + COPY_U128( xmm[8], gst->guest_YMM8 ); + COPY_U128( xmm[9], gst->guest_YMM9 ); + COPY_U128( xmm[10], gst->guest_YMM10 ); + COPY_U128( xmm[11], gst->guest_YMM11 ); + COPY_U128( xmm[12], gst->guest_YMM12 ); + COPY_U128( xmm[13], gst->guest_YMM13 ); + COPY_U128( xmm[14], gst->guest_YMM14 ); + COPY_U128( xmm[15], gst->guest_YMM15 ); # undef COPY_U128 } @@ -1766,22 +1766,22 @@ VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr ) _dst[2] = _src[2]; _dst[3] = _src[3]; } \ while (0) - COPY_U128( gst->guest_XMM0, xmm[0] ); - COPY_U128( gst->guest_XMM1, xmm[1] ); - COPY_U128( gst->guest_XMM2, xmm[2] ); - COPY_U128( gst->guest_XMM3, xmm[3] ); - COPY_U128( gst->guest_XMM4, xmm[4] ); - COPY_U128( gst->guest_XMM5, xmm[5] ); - COPY_U128( gst->guest_XMM6, xmm[6] ); - COPY_U128( gst->guest_XMM7, xmm[7] ); - COPY_U128( gst->guest_XMM8, xmm[8] ); - COPY_U128( gst->guest_XMM9, xmm[9] ); - COPY_U128( gst->guest_XMM10, xmm[10] ); - COPY_U128( gst->guest_XMM11, xmm[11] ); - COPY_U128( gst->guest_XMM12, xmm[12] ); - COPY_U128( gst->guest_XMM13, xmm[13] ); - COPY_U128( gst->guest_XMM14, xmm[14] ); - COPY_U128( gst->guest_XMM15, xmm[15] ); + COPY_U128( gst->guest_YMM0, xmm[0] ); + COPY_U128( gst->guest_YMM1, xmm[1] ); + COPY_U128( gst->guest_YMM2, xmm[2] ); + COPY_U128( gst->guest_YMM3, xmm[3] ); + COPY_U128( gst->guest_YMM4, xmm[4] ); + COPY_U128( gst->guest_YMM5, xmm[5] ); + COPY_U128( gst->guest_YMM6, xmm[6] ); + COPY_U128( gst->guest_YMM7, xmm[7] ); + COPY_U128( gst->guest_YMM8, xmm[8] ); + COPY_U128( gst->guest_YMM9, xmm[9] ); + COPY_U128( gst->guest_YMM10, xmm[10] ); + COPY_U128( gst->guest_YMM11, xmm[11] ); + COPY_U128( gst->guest_YMM12, xmm[12] ); + COPY_U128( gst->guest_YMM13, xmm[13] ); + COPY_U128( gst->guest_YMM14, xmm[14] ); + COPY_U128( gst->guest_YMM15, 
xmm[15] ); # undef COPY_U128 @@ -3129,11 +3129,10 @@ ULong amd64g_dirtyhelper_PCMPxSTRx ( // In all cases, the new OSZACP value is the lowest 16 of // the return value. if (isxSTRM) { - /* gst->guest_XMM0 = resV; */ // gcc don't like that - gst->guest_XMM0[0] = resV.w32[0]; - gst->guest_XMM0[1] = resV.w32[1]; - gst->guest_XMM0[2] = resV.w32[2]; - gst->guest_XMM0[3] = resV.w32[3]; + gst->guest_YMM0[0] = resV.w32[0]; + gst->guest_YMM0[1] = resV.w32[1]; + gst->guest_YMM0[2] = resV.w32[2]; + gst->guest_YMM0[3] = resV.w32[3]; return resOSZACP & 0x8D5; } else { UInt newECX = resV.w32[0] & 0xFFFF; @@ -3507,29 +3506,31 @@ void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state ) /* Initialise the simulated FPU */ amd64g_dirtyhelper_FINIT( vex_state ); - /* Initialise the SSE state. */ -# define SSEZERO(_xmm) _xmm[0]=_xmm[1]=_xmm[2]=_xmm[3] = 0; - + /* Initialise the AVX state. */ +# define AVXZERO(_ymm) \ + do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \ + _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \ + } while (0) vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST; - SSEZERO(vex_state->guest_XMM0); - SSEZERO(vex_state->guest_XMM1); - SSEZERO(vex_state->guest_XMM2); - SSEZERO(vex_state->guest_XMM3); - SSEZERO(vex_state->guest_XMM4); - SSEZERO(vex_state->guest_XMM5); - SSEZERO(vex_state->guest_XMM6); - SSEZERO(vex_state->guest_XMM7); - SSEZERO(vex_state->guest_XMM8); - SSEZERO(vex_state->guest_XMM9); - SSEZERO(vex_state->guest_XMM10); - SSEZERO(vex_state->guest_XMM11); - SSEZERO(vex_state->guest_XMM12); - SSEZERO(vex_state->guest_XMM13); - SSEZERO(vex_state->guest_XMM14); - SSEZERO(vex_state->guest_XMM15); - SSEZERO(vex_state->guest_XMM16); - -# undef SSEZERO + AVXZERO(vex_state->guest_YMM0); + AVXZERO(vex_state->guest_YMM1); + AVXZERO(vex_state->guest_YMM2); + AVXZERO(vex_state->guest_YMM3); + AVXZERO(vex_state->guest_YMM4); + AVXZERO(vex_state->guest_YMM5); + AVXZERO(vex_state->guest_YMM6); + AVXZERO(vex_state->guest_YMM7); + AVXZERO(vex_state->guest_YMM8); + AVXZERO(vex_state->guest_YMM9); + AVXZERO(vex_state->guest_YMM10); + AVXZERO(vex_state->guest_YMM11); + AVXZERO(vex_state->guest_YMM12); + AVXZERO(vex_state->guest_YMM13); + AVXZERO(vex_state->guest_YMM14); + AVXZERO(vex_state->guest_YMM15); + AVXZERO(vex_state->guest_YMM16); + +# undef AVXZERO vex_state->guest_EMWARN = EmWarn_NONE; diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index ab79312c97..fc93730fe7 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -416,34 +416,25 @@ static void unimplemented ( HChar* str ) #define OFFB_FTOP offsetof(VexGuestAMD64State,guest_FTOP) #define OFFB_FC3210 offsetof(VexGuestAMD64State,guest_FC3210) #define OFFB_FPROUND offsetof(VexGuestAMD64State,guest_FPROUND) -//.. -//.. #define OFFB_CS offsetof(VexGuestX86State,guest_CS) -//.. #define OFFB_DS offsetof(VexGuestX86State,guest_DS) -//.. #define OFFB_ES offsetof(VexGuestX86State,guest_ES) -//.. #define OFFB_FS offsetof(VexGuestX86State,guest_FS) -//.. #define OFFB_GS offsetof(VexGuestX86State,guest_GS) -//.. #define OFFB_SS offsetof(VexGuestX86State,guest_SS) -//.. #define OFFB_LDT offsetof(VexGuestX86State,guest_LDT) -//.. 
#define OFFB_GDT offsetof(VexGuestX86State,guest_GDT) #define OFFB_SSEROUND offsetof(VexGuestAMD64State,guest_SSEROUND) -#define OFFB_XMM0 offsetof(VexGuestAMD64State,guest_XMM0) -#define OFFB_XMM1 offsetof(VexGuestAMD64State,guest_XMM1) -#define OFFB_XMM2 offsetof(VexGuestAMD64State,guest_XMM2) -#define OFFB_XMM3 offsetof(VexGuestAMD64State,guest_XMM3) -#define OFFB_XMM4 offsetof(VexGuestAMD64State,guest_XMM4) -#define OFFB_XMM5 offsetof(VexGuestAMD64State,guest_XMM5) -#define OFFB_XMM6 offsetof(VexGuestAMD64State,guest_XMM6) -#define OFFB_XMM7 offsetof(VexGuestAMD64State,guest_XMM7) -#define OFFB_XMM8 offsetof(VexGuestAMD64State,guest_XMM8) -#define OFFB_XMM9 offsetof(VexGuestAMD64State,guest_XMM9) -#define OFFB_XMM10 offsetof(VexGuestAMD64State,guest_XMM10) -#define OFFB_XMM11 offsetof(VexGuestAMD64State,guest_XMM11) -#define OFFB_XMM12 offsetof(VexGuestAMD64State,guest_XMM12) -#define OFFB_XMM13 offsetof(VexGuestAMD64State,guest_XMM13) -#define OFFB_XMM14 offsetof(VexGuestAMD64State,guest_XMM14) -#define OFFB_XMM15 offsetof(VexGuestAMD64State,guest_XMM15) -#define OFFB_XMM16 offsetof(VexGuestAMD64State,guest_XMM16) +#define OFFB_YMM0 offsetof(VexGuestAMD64State,guest_YMM0) +#define OFFB_YMM1 offsetof(VexGuestAMD64State,guest_YMM1) +#define OFFB_YMM2 offsetof(VexGuestAMD64State,guest_YMM2) +#define OFFB_YMM3 offsetof(VexGuestAMD64State,guest_YMM3) +#define OFFB_YMM4 offsetof(VexGuestAMD64State,guest_YMM4) +#define OFFB_YMM5 offsetof(VexGuestAMD64State,guest_YMM5) +#define OFFB_YMM6 offsetof(VexGuestAMD64State,guest_YMM6) +#define OFFB_YMM7 offsetof(VexGuestAMD64State,guest_YMM7) +#define OFFB_YMM8 offsetof(VexGuestAMD64State,guest_YMM8) +#define OFFB_YMM9 offsetof(VexGuestAMD64State,guest_YMM9) +#define OFFB_YMM10 offsetof(VexGuestAMD64State,guest_YMM10) +#define OFFB_YMM11 offsetof(VexGuestAMD64State,guest_YMM11) +#define OFFB_YMM12 offsetof(VexGuestAMD64State,guest_YMM12) +#define OFFB_YMM13 offsetof(VexGuestAMD64State,guest_YMM13) +#define OFFB_YMM14 offsetof(VexGuestAMD64State,guest_YMM14) +#define OFFB_YMM15 offsetof(VexGuestAMD64State,guest_YMM15) +#define OFFB_YMM16 offsetof(VexGuestAMD64State,guest_YMM16) #define OFFB_EMWARN offsetof(VexGuestAMD64State,guest_EMWARN) #define OFFB_TISTART offsetof(VexGuestAMD64State,guest_TISTART) @@ -475,9 +466,6 @@ static void unimplemented ( HChar* str ) #define R_R14 14 #define R_R15 15 -//.. #define R_AL (0+R_EAX) -//.. #define R_AH (4+R_EAX) - /* This is the Intel register encoding -- segment regs. */ #define R_ES 0 #define R_CS 1 @@ -649,8 +637,8 @@ static IRType szToITy ( Int n ) most especially when making sense of register fields in instructions. - The top 16 bits of the prefix are 0x3141, just as a hacky way - to ensure it really is a valid prefix. + The top 8 bits of the prefix are 0x55, just as a hacky way to + ensure it really is a valid prefix. Things you can safely assume about a well-formed prefix: * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set. 
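(Editor's aside on the Prefix changes in the hunks that follow: the new 0x55 tag byte and the VEX.vvvv bits can be exercised in isolation. The sketch below is not part of the patch; PFX_EMPTY, PFX_VEX, PFX_VEXL, PFX_VEXnV0 and the divide-by-PFX_VEXnV0 extraction idiom mirror the hunks below, while pack_vex and main are invented purely to show how a complemented vvvv value stored in bits 21..18 round-trips.)

    /* Standalone sketch only; see the note above.  The constants and the
       extraction idiom mirror the patch, pack_vex/main are invented. */
    #include <stdio.h>

    typedef unsigned int UInt;
    typedef UInt Prefix;

    #define PFX_EMPTY  0x55000000U  /* top byte 0x55 marks a well-formed prefix */
    #define PFX_VEX    (1U<<16)     /* VEX prefix present (0xC4 or 0xC5) */
    #define PFX_VEXL   (1U<<17)     /* VEX L bit */
    #define PFX_VEXnV0 (1U<<18)     /* lowest of the four ~vvvv bits */

    /* Hypothetical packer: store an already-complemented vvvv field in
       bits 21..18, which is where the patch keeps it. */
    static Prefix pack_vex ( UInt nvvvv, UInt L )
    {
       return PFX_EMPTY | PFX_VEX | (L ? PFX_VEXL : 0)
              | ((nvvvv & 0xF) * PFX_VEXnV0);
    }

    /* Same idiom as the patch's getVexNvvvv: dividing by PFX_VEXnV0 is
       just a right shift by 18, since PFX_VEXnV0 is a power of two. */
    static UInt get_vex_nvvvv ( Prefix pfx )
    {
       return (pfx / PFX_VEXnV0) & 0xF;
    }

    int main ( void )
    {
       Prefix p = pack_vex(0xA, 1);
       printf("valid=%d L=%d nvvvv=%#x\n",
              (p & 0xFF000000U) == PFX_EMPTY,  /* the IS_VALID_PFX test */
              (p & PFX_VEXL) ? 1 : 0,
              get_vex_nvvvv(p));               /* prints nvvvv=0xa */
       return 0;
    }

Nothing here depends on VEX decoder internals; it only shows why the 0x55 tag byte and the power-of-two placement of PFX_VEXnV0 keep both the validity check and the field extraction cheap.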
@@ -661,27 +649,37 @@ static IRType szToITy ( Int n ) typedef UInt Prefix; -#define PFX_ASO (1<<0) /* address-size override present (0x67) */ -#define PFX_66 (1<<1) /* operand-size override-to-16 present (0x66) */ -#define PFX_REX (1<<2) /* REX byte present (0x40 to 0x4F) */ -#define PFX_REXW (1<<3) /* REX W bit, if REX present, else 0 */ -#define PFX_REXR (1<<4) /* REX R bit, if REX present, else 0 */ -#define PFX_REXX (1<<5) /* REX X bit, if REX present, else 0 */ -#define PFX_REXB (1<<6) /* REX B bit, if REX present, else 0 */ -#define PFX_LOCK (1<<7) /* bus LOCK prefix present (0xF0) */ -#define PFX_F2 (1<<8) /* REP/REPE/REPZ prefix present (0xF2) */ -#define PFX_F3 (1<<9) /* REPNE/REPNZ prefix present (0xF3) */ -#define PFX_CS (1<<10) /* CS segment prefix present (0x2E) */ -#define PFX_DS (1<<11) /* DS segment prefix present (0x3E) */ -#define PFX_ES (1<<12) /* ES segment prefix present (0x26) */ -#define PFX_FS (1<<13) /* FS segment prefix present (0x64) */ -#define PFX_GS (1<<14) /* GS segment prefix present (0x65) */ -#define PFX_SS (1<<15) /* SS segment prefix present (0x36) */ - -#define PFX_EMPTY 0x31410000 +#define PFX_ASO (1<<0) /* address-size override present (0x67) */ +#define PFX_66 (1<<1) /* operand-size override-to-16 present (0x66) */ +#define PFX_REX (1<<2) /* REX byte present (0x40 to 0x4F) */ +#define PFX_REXW (1<<3) /* REX W bit, if REX present, else 0 */ +#define PFX_REXR (1<<4) /* REX R bit, if REX present, else 0 */ +#define PFX_REXX (1<<5) /* REX X bit, if REX present, else 0 */ +#define PFX_REXB (1<<6) /* REX B bit, if REX present, else 0 */ +#define PFX_LOCK (1<<7) /* bus LOCK prefix present (0xF0) */ +#define PFX_F2 (1<<8) /* REP/REPE/REPZ prefix present (0xF2) */ +#define PFX_F3 (1<<9) /* REPNE/REPNZ prefix present (0xF3) */ +#define PFX_CS (1<<10) /* CS segment prefix present (0x2E) */ +#define PFX_DS (1<<11) /* DS segment prefix present (0x3E) */ +#define PFX_ES (1<<12) /* ES segment prefix present (0x26) */ +#define PFX_FS (1<<13) /* FS segment prefix present (0x64) */ +#define PFX_GS (1<<14) /* GS segment prefix present (0x65) */ +#define PFX_SS (1<<15) /* SS segment prefix present (0x36) */ +#define PFX_VEX (1<<16) /* VEX prefix present (0xC4 or 0xC5) */ +#define PFX_VEXL (1<<17) /* VEX L bit, if VEX present, else 0 */ +/* The extra register field VEX.vvvv is encoded (after not-ing it) as + PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit + positions. */ +#define PFX_VEXnV0 (1<<18) /* ~VEX vvvv[0], if VEX present, else 0 */ +#define PFX_VEXnV1 (1<<19) /* ~VEX vvvv[1], if VEX present, else 0 */ +#define PFX_VEXnV2 (1<<20) /* ~VEX vvvv[2], if VEX present, else 0 */ +#define PFX_VEXnV3 (1<<21) /* ~VEX vvvv[3], if VEX present, else 0 */ + + +#define PFX_EMPTY 0x55000000 static Bool IS_VALID_PFX ( Prefix pfx ) { - return toBool((pfx & 0xFFFF0000) == PFX_EMPTY); + return toBool((pfx & 0xFF000000) == PFX_EMPTY); } static Bool haveREX ( Prefix pfx ) { @@ -691,11 +689,9 @@ static Bool haveREX ( Prefix pfx ) { static Int getRexW ( Prefix pfx ) { return (pfx & PFX_REXW) ? 1 : 0; } -/* Apparently unused. static Int getRexR ( Prefix pfx ) { return (pfx & PFX_REXR) ? 1 : 0; } -*/ static Int getRexX ( Prefix pfx ) { return (pfx & PFX_REXX) ? 1 : 0; } @@ -783,6 +779,21 @@ static Prefix clearSegBits ( Prefix p ) p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS); } +/* Get the (inverted, hence back to "normal") VEX.vvvv field. 
*/ +static UInt getVexNvvvv ( Prefix pfx ) { + UInt r = (UInt)pfx; + r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */ + return r & 0xF; +} + +static Bool haveVEX ( Prefix pfx ) { + return toBool(pfx & PFX_VEX); +} + +static Int getVexL ( Prefix pfx ) { + return (pfx & PFX_VEXL) ? 1 : 0; +} + /*------------------------------------------------------------*/ /*--- For dealing with escapes ---*/ @@ -1331,40 +1342,34 @@ HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm ) /*--- For dealing with XMM registers ---*/ /*------------------------------------------------------------*/ -//.. static Int segmentGuestRegOffset ( UInt sreg ) -//.. { -//.. switch (sreg) { -//.. case R_ES: return OFFB_ES; -//.. case R_CS: return OFFB_CS; -//.. case R_SS: return OFFB_SS; -//.. case R_DS: return OFFB_DS; -//.. case R_FS: return OFFB_FS; -//.. case R_GS: return OFFB_GS; -//.. default: vpanic("segmentGuestRegOffset(x86)"); -//.. } -//.. } +static Int ymmGuestRegOffset ( UInt ymmreg ) +{ + switch (ymmreg) { + case 0: return OFFB_YMM0; + case 1: return OFFB_YMM1; + case 2: return OFFB_YMM2; + case 3: return OFFB_YMM3; + case 4: return OFFB_YMM4; + case 5: return OFFB_YMM5; + case 6: return OFFB_YMM6; + case 7: return OFFB_YMM7; + case 8: return OFFB_YMM8; + case 9: return OFFB_YMM9; + case 10: return OFFB_YMM10; + case 11: return OFFB_YMM11; + case 12: return OFFB_YMM12; + case 13: return OFFB_YMM13; + case 14: return OFFB_YMM14; + case 15: return OFFB_YMM15; + default: vpanic("ymmGuestRegOffset(amd64)"); + } +} static Int xmmGuestRegOffset ( UInt xmmreg ) { - switch (xmmreg) { - case 0: return OFFB_XMM0; - case 1: return OFFB_XMM1; - case 2: return OFFB_XMM2; - case 3: return OFFB_XMM3; - case 4: return OFFB_XMM4; - case 5: return OFFB_XMM5; - case 6: return OFFB_XMM6; - case 7: return OFFB_XMM7; - case 8: return OFFB_XMM8; - case 9: return OFFB_XMM9; - case 10: return OFFB_XMM10; - case 11: return OFFB_XMM11; - case 12: return OFFB_XMM12; - case 13: return OFFB_XMM13; - case 14: return OFFB_XMM14; - case 15: return OFFB_XMM15; - default: vpanic("xmmGuestRegOffset(amd64)"); - } + /* Correct for little-endian host only. */ + vassert(!host_is_bigendian); + return ymmGuestRegOffset( xmmreg ); } /* Lanes of vector registers are always numbered from zero being the @@ -1394,16 +1399,13 @@ static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno ) return xmmGuestRegOffset( xmmreg ) + 8 * laneno; } -//.. static IRExpr* getSReg ( UInt sreg ) -//.. { -//.. return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 ); -//.. } -//.. -//.. static void putSReg ( UInt sreg, IRExpr* e ) -//.. { -//.. vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16); -//.. stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) ); -//.. } +static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno ) +{ + /* Correct for little-endian host only. 
*/ + vassert(!host_is_bigendian); + vassert(laneno >= 0 && laneno < 2); + return ymmGuestRegOffset( ymmreg ) + 16 * laneno; +} static IRExpr* getXMMReg ( UInt xmmreg ) { @@ -1471,11 +1473,40 @@ static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e ) stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) ); } +static IRExpr* getYMMReg ( UInt xmmreg ) +{ + return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 ); +} + +static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno ) +{ + return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 ); +} + +static void putYMMReg ( UInt ymmreg, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256); + stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) ); +} + +static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128); + stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) ); +} + static IRExpr* mkV128 ( UShort mask ) { return IRExpr_Const(IRConst_V128(mask)); } +/* Write the low half of a YMM reg and zero out the upper half. */ +static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e ) +{ + putYMMRegLane128( ymmreg, 0, e ); + putYMMRegLane128( ymmreg, 1, mkV128(0) ); +} + static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y ) { vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1); @@ -2087,6 +2118,17 @@ static HChar nameISize ( Int size ) } } +static HChar* nameYMMReg ( Int ymmreg ) +{ + static HChar* ymm_names[16] + = { "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15" }; + if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)"); + return ymm_names[ymmreg]; +} + /*------------------------------------------------------------*/ /*--- JMP helpers ---*/ @@ -8479,76 +8521,116 @@ static ULong dis_SSEint_E_to_G( } -/* Helper for doing SSE FP comparisons. */ - -static void findSSECmpOp ( Bool* needNot, IROp* op, - Int imm8, Bool all_lanes, Int sz ) +/* Helper for doing SSE FP comparisons. False return ==> unhandled. + This is all a bit of a kludge in that it ignores the subtleties of + ordered-vs-unordered and signalling-vs-nonsignalling in the Intel + spec. */ +static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP, + /*OUT*/IROp* opP, + /*OUT*/Bool* postNotP, + UInt imm8, Bool all_lanes, Int sz ) { - imm8 &= 7; - *needNot = False; - *op = Iop_INVALID; - if (imm8 >= 4) { - *needNot = True; - imm8 -= 4; + if (imm8 >= 32) return False; + + /* First, compute a (preSwap, op, postNot) triple from + the supplied imm8. 
*/ + Bool pre = False; + IROp op = Iop_INVALID; + Bool not = False; + +# define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; } + switch (imm8) { + case 0x0: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ + case 0x1: XXX(False, Iop_CmpLT32Fx4, False); break; // LT + case 0x2: XXX(False, Iop_CmpLE32Fx4, False); break; // LE + case 0x3: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD + case 0x4: XXX(False, Iop_CmpEQ32Fx4, True); break; // NE + case 0x5: XXX(False, Iop_CmpLT32Fx4, True); break; // NLT + case 0x6: XXX(False, Iop_CmpLE32Fx4, True); break; // NLE + case 0x7: XXX(False, Iop_CmpUN32Fx4, True); break; // ORD + /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns] */ + case 0xA: XXX(True, Iop_CmpLT32Fx4, True); break; // NGT_US + case 0xC: XXX(False, Iop_CmpEQ32Fx4, True); break; // NEQ_OQ + case 0xD: XXX(True, Iop_CmpLE32Fx4, False); break; // GE_OS + case 0xE: XXX(True, Iop_CmpLT32Fx4, False); break; // GT_OS + default: break; } +# undef XXX + if (op == Iop_INVALID) return False; - if (sz == 4 && all_lanes) { - switch (imm8) { - case 0: *op = Iop_CmpEQ32Fx4; return; - case 1: *op = Iop_CmpLT32Fx4; return; - case 2: *op = Iop_CmpLE32Fx4; return; - case 3: *op = Iop_CmpUN32Fx4; return; - default: break; + /* Now convert the op into one with the same arithmetic but that is + correct for the width and laneage requirements. */ + + /**/ if (sz == 4 && all_lanes) { + switch (op) { + case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break; + case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break; + case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break; + case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break; + default: vassert(0); } } - if (sz == 4 && !all_lanes) { - switch (imm8) { - case 0: *op = Iop_CmpEQ32F0x4; return; - case 1: *op = Iop_CmpLT32F0x4; return; - case 2: *op = Iop_CmpLE32F0x4; return; - case 3: *op = Iop_CmpUN32F0x4; return; - default: break; + else if (sz == 4 && !all_lanes) { + switch (op) { + case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break; + case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break; + case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break; + case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break; + default: vassert(0); } } - if (sz == 8 && all_lanes) { - switch (imm8) { - case 0: *op = Iop_CmpEQ64Fx2; return; - case 1: *op = Iop_CmpLT64Fx2; return; - case 2: *op = Iop_CmpLE64Fx2; return; - case 3: *op = Iop_CmpUN64Fx2; return; - default: break; + else if (sz == 8 && all_lanes) { + switch (op) { + case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break; + case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break; + case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break; + case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break; + default: vassert(0); } } - if (sz == 8 && !all_lanes) { - switch (imm8) { - case 0: *op = Iop_CmpEQ64F0x2; return; - case 1: *op = Iop_CmpLT64F0x2; return; - case 2: *op = Iop_CmpLE64F0x2; return; - case 3: *op = Iop_CmpUN64F0x2; return; - default: break; + else if (sz == 8 && !all_lanes) { + switch (op) { + case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break; + case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break; + case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break; + case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break; + default: vassert(0); } } - vpanic("findSSECmpOp(amd64,guest)"); + else { + vpanic("findSSECmpOp(amd64,guest)"); + } + + *preSwapP = pre; *opP = op; *postNotP = not; + return True; } -/* Handles SSE 32F/64F comparisons. */ -static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi, +/* Handles SSE 32F/64F comparisons. 
It can fail, in which case it + returns the original delta to indicate failure. */ + +static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi, Prefix pfx, Long delta, HChar* opname, Bool all_lanes, Int sz ) { + Long delta0 = delta; HChar dis_buf[50]; - Int alen, imm8; + Int alen; + UInt imm8; IRTemp addr; - Bool needNot = False; + Bool preSwap = False; IROp op = Iop_INVALID; + Bool postNot = False; IRTemp plain = newTemp(Ity_V128); UChar rm = getUChar(delta); UShort mask = 0; vassert(sz == 4 || sz == 8); if (epartIsReg(rm)) { imm8 = getUChar(delta+1); - findSSECmpOp(&needNot, &op, imm8, all_lanes, sz); + if (imm8 >= 8) return delta0; /* FAIL */ + Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz); + if (!ok) return delta0; /* FAIL */ + vassert(!preSwap); /* never needed for imm8 < 8 */ assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)), getXMMReg(eregOfRexRM(pfx,rm))) ); delta += 2; @@ -8559,14 +8641,20 @@ static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi, } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 ); imm8 = getUChar(delta+alen); - findSSECmpOp(&needNot, &op, imm8, all_lanes, sz); + if (imm8 >= 8) return delta0; /* FAIL */ + Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz); + if (!ok) return delta0; /* FAIL */ + vassert(!preSwap); /* never needed for imm8 < 8 */ assign( plain, binop( op, getXMMReg(gregOfRexRM(pfx,rm)), - all_lanes ? loadLE(Ity_V128, mkexpr(addr)) - : sz == 8 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr))) - : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))) + all_lanes + ? loadLE(Ity_V128, mkexpr(addr)) + : sz == 8 + ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr))) + : /*sz==4*/ + unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))) ) ); delta += alen+1; @@ -8576,12 +8664,12 @@ static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi, nameXMMReg(gregOfRexRM(pfx,rm)) ); } - if (needNot && all_lanes) { + if (postNot && all_lanes) { putXMMReg( gregOfRexRM(pfx,rm), unop(Iop_NotV128, mkexpr(plain)) ); } else - if (needNot && !all_lanes) { + if (postNot && !all_lanes) { mask = toUShort(sz==4 ? 0x000F : 0x00FF); putXMMReg( gregOfRexRM(pfx,rm), binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) ); @@ -8985,12 +9073,13 @@ static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, if effective_addr is not 16-aligned. This is required behaviour for some SSE3 instructions and all 128-bit SSSE3 instructions. This assumes that guest_RIP_curr_instr is set correctly! */ -static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) +static +void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask ) { stmt( IRStmt_Exit( binop(Iop_CmpNE64, - binop(Iop_And64,mkexpr(effective_addr),mkU64(0xF)), + binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)), mkU64(0)), Ijk_SigSEGV, IRConst_U64(guest_RIP_curr_instr), @@ -8999,6 +9088,13 @@ static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) ); } +static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) { + gen_SEGV_if_not_XX_aligned(effective_addr, 16-1); +} + +static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) { + gen_SEGV_if_not_XX_aligned(effective_addr, 32-1); +} /* Helper for deciding whether a given insn (starting at the opcode byte) may validly be used with a LOCK prefix. 
The following insns @@ -9136,6 +9232,334 @@ static Bool can_be_used_with_LOCK_prefix ( UChar* opc ) /*--- ---*/ /*------------------------------------------------------------*/ +static Long dis_COMISD ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx, UChar opc ) +{ + vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/); + Int alen = 0; + HChar dis_buf[50]; + IRTemp argL = newTemp(Ity_F64); + IRTemp argR = newTemp(Ity_F64); + UChar modrm = getUChar(delta); + IRTemp addr = IRTemp_INVALID; + if (epartIsReg(modrm)) { + assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm), + 0/*lowest lane*/ ) ); + delta += 1; + DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "", + opc==0x2E ? "u" : "", + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm)) ); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( argR, loadLE(Ity_F64, mkexpr(addr)) ); + delta += alen; + DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "", + opc==0x2E ? "u" : "", + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm)) ); + } + assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm), + 0/*lowest lane*/ ) ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( + OFFB_CC_DEP1, + binop( Iop_And64, + unop( Iop_32Uto64, + binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ), + mkU64(0x45) + ))); + return delta; +} + + +static Long dis_COMISS ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx, UChar opc ) +{ + vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/); + Int alen = 0; + HChar dis_buf[50]; + IRTemp argL = newTemp(Ity_F32); + IRTemp argR = newTemp(Ity_F32); + UChar modrm = getUChar(delta); + IRTemp addr = IRTemp_INVALID; + if (epartIsReg(modrm)) { + assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm), + 0/*lowest lane*/ ) ); + delta += 1; + DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "", + opc==0x2E ? "u" : "", + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm)) ); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( argR, loadLE(Ity_F32, mkexpr(addr)) ); + delta += alen; + DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "", + opc==0x2E ? "u" : "", + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm)) ); + } + assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm), + 0/*lowest lane*/ ) ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( + OFFB_CC_DEP1, + binop( Iop_And64, + unop( Iop_32Uto64, + binop(Iop_CmpF64, + unop(Iop_F32toF64,mkexpr(argL)), + unop(Iop_F32toF64,mkexpr(argR)))), + mkU64(0x45) + ))); + return delta; +} + + +static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool writesYmm ) +{ + Int order; + Int alen = 0; + HChar dis_buf[50]; + IRTemp sV = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + HChar* strV = writesYmm ? 
"v" : ""; + IRTemp addr = IRTemp_INVALID; + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + order = (Int)getUChar(delta+1); + delta += 1+1; + DIP("%spshufd $%d,%s,%s\n", strV, order, + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + order = (Int)getUChar(delta+alen); + delta += alen+1; + DIP("%spshufd $%d,%s,%s\n", strV, order, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + IRTemp s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + breakup128to32s( sV, &s3, &s2, &s1, &s0 ); + +# define SEL(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) + IRTemp dV = newTemp(Ity_V128); + assign(dV, + mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3), + SEL((order>>2)&3), SEL((order>>0)&3) ) + ); +# undef SEL + + (writesYmm ? putYMMRegLoAndZU : putXMMReg) + (gregOfRexRM(pfx,modrm), mkexpr(dV)); + return delta; +} + + +static IRTemp math_PSRLDQ ( IRTemp sV, Int imm ) +{ + IRTemp dV = newTemp(Ity_V128); + IRTemp hi64 = newTemp(Ity_I64); + IRTemp lo64 = newTemp(Ity_I64); + IRTemp hi64r = newTemp(Ity_I64); + IRTemp lo64r = newTemp(Ity_I64); + + vassert(imm >= 0 && imm <= 255); + if (imm >= 16) { + assign(dV, mkV128(0x0000)); + return dV; + } + + assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( lo64, unop(Iop_V128to64, mkexpr(sV)) ); + + if (imm == 0) { + assign( lo64r, mkexpr(lo64) ); + assign( hi64r, mkexpr(hi64) ); + } + else + if (imm == 8) { + assign( hi64r, mkU64(0) ); + assign( lo64r, mkexpr(hi64) ); + } + else + if (imm > 8) { + assign( hi64r, mkU64(0) ); + assign( lo64r, binop( Iop_Shr64, + mkexpr(hi64), + mkU8( 8*(imm-8) ) )); + } else { + assign( hi64r, binop( Iop_Shr64, + mkexpr(hi64), + mkU8(8 * imm) )); + assign( lo64r, + binop( Iop_Or64, + binop(Iop_Shr64, mkexpr(lo64), + mkU8(8 * imm)), + binop(Iop_Shl64, mkexpr(hi64), + mkU8(8 * (8 - imm)) ) + ) + ); + } + + assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) ); + return dV; +} + + +static Long dis_CVTxSD2SI ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx, UChar opc, Int sz ) +{ + vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/); + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + IRTemp addr = IRTemp_INVALID; + IRTemp rmode = newTemp(Ity_I32); + IRTemp f64lo = newTemp(Ity_F64); + Bool r2zero = toBool(opc == 0x2C); + + modrm = getUChar(delta); + if (epartIsReg(modrm)) { + delta += 1; + assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0)); + DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "", + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameIReg(sz, gregOfRexRM(pfx,modrm), + False)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign(f64lo, loadLE(Ity_F64, mkexpr(addr))); + delta += alen; + DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? 
"t" : "", + dis_buf, + nameIReg(sz, gregOfRexRM(pfx,modrm), + False)); + } + + if (r2zero) { + assign( rmode, mkU32((UInt)Irrm_ZERO) ); + } else { + assign( rmode, get_sse_roundingmode() ); + } + + if (sz == 4) { + putIReg32( gregOfRexRM(pfx,modrm), + binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) ); + } else { + putIReg64( gregOfRexRM(pfx,modrm), + binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) ); + } + + return delta; +} + + +static Long dis_CVTPS2PD ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp f32lo = newTemp(Ity_F32); + IRTemp f32hi = newTemp(Ity_F32); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( f32lo, getXMMRegLane32F(rE, 0) ); + assign( f32hi, getXMMRegLane32F(rE, 1) ); + delta += 1; + DIP("%scvtps2pd %s,%s\n", + isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) ); + assign( f32hi, loadLE(Ity_F32, + binop(Iop_Add64,mkexpr(addr),mkU64(4))) ); + delta += alen; + DIP("%scvtps2pd %s,%s\n", + isAvx ? "v" : "", dis_buf, nameXMMReg(rG)); + } + + putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) ); + putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) ); + if (isAvx) + putYMMRegLane128( rG, 1, mkV128(0)); + return delta; +} + + +static Long dis_CVTPD2PS ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + IRTemp argV = newTemp(Ity_V128); + IRTemp rmode = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( argV, getXMMReg(rE) ); + delta += 1; + DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "", + nameXMMReg(rE), nameXMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( argV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += alen; + DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "", + dis_buf, nameXMMReg(rG) ); + } + + assign( rmode, get_sse_roundingmode() ); + IRTemp t0 = newTemp(Ity_F64); + IRTemp t1 = newTemp(Ity_F64); + assign( t0, unop(Iop_ReinterpI64asF64, + unop(Iop_V128to64, mkexpr(argV))) ); + assign( t1, unop(Iop_ReinterpI64asF64, + unop(Iop_V128HIto64, mkexpr(argV))) ); + +# define CVT(_t) binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) ) + putXMMRegLane32( rG, 3, mkU32(0) ); + putXMMRegLane32( rG, 2, mkU32(0) ); + putXMMRegLane32F( rG, 1, CVT(t1) ); + putXMMRegLane32F( rG, 0, CVT(t0) ); +# undef CVT + if (isAvx) + putYMMRegLane128( rG, 1, mkV128(0) ); + + return delta; +} + + +static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, UChar opc ) +{ + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + Bool hi = toBool(opc == 0x15); + vassert(opc == 0x15/*UNPCKLPS*/ || opc == 0x14/*UNPCKHPS*/); + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + breakup128to32s( dV, &d3, &d2, &d1, &d0 ); + breakup128to32s( sV, &s3, &s2, &s1, &s0 ); + IRTemp res = newTemp(Ity_V128); + assign(res, hi ? mk128from32s( s3, d3, s2, d2 ) + : mk128from32s( s1, d1, s0, d0 )); + return res; +} + + /* Note, this also handles SSE(1) insns. 
*/ __attribute__((noinline)) static @@ -9410,39 +9834,27 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */ /* These just appear to be special cases of SHUFPS */ if (haveNo66noF2noF3(pfx) && sz == 4) { - IRTemp sV, dV; - IRTemp s3, s2, s1, s0, d3, d2, d1, d0; - Bool hi = toBool(opc == 0x15); - sV = newTemp(Ity_V128); - dV = newTemp(Ity_V128); - s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + Bool hi = toBool(opc == 0x15); + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); modrm = getUChar(delta); - assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); - + UInt rG = gregOfRexRM(pfx,modrm); + assign( dV, getXMMReg(rG) ); if (epartIsReg(modrm)) { - assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getXMMReg(rE) ); delta += 1; DIP("unpck%sps %s,%s\n", hi ? "h" : "l", - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); + nameXMMReg(rE), nameXMMReg(rG)); } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); delta += alen; DIP("unpck%sps %s,%s\n", hi ? "h" : "l", - dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm))); - } - - breakup128to32s( dV, &d3, &d2, &d1, &d0 ); - breakup128to32s( sV, &s3, &s2, &s1, &s0 ); - - if (hi) { - putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s3, d3, s2, d2 ) ); - } else { - putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s1, d1, s0, d0 ) ); + dis_buf, nameXMMReg(rG)); } - + IRTemp res = math_UNPCKxPS_128( sV, dV, opc ); + putXMMReg( rG, mkexpr(res) ); goto decode_success; } /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */ @@ -9785,14 +10197,14 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, if (epartIsReg(modrm)) { assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) ); delta += 1; - DIP("cvtsi2sd %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); + DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); assign( arg32, loadLE(Ity_I32, mkexpr(addr)) ); delta += alen; - DIP("cvtsi2sd %s,%s\n", dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm)) ); + DIP("cvtsi2sdl %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm)) ); } putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0, unop(Iop_I32StoF64, mkexpr(arg32)) @@ -10007,79 +10419,44 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, truncating towards zero */ if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) { + delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz); + goto decode_success; + } + /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x + I32 in mmx, according to prevailing SSE rounding mode */ + /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x + I32 in mmx, rounding towards zero */ + if (have66noF2noF3(pfx) && sz == 2) { + IRTemp dst64 = newTemp(Ity_I64); IRTemp rmode = newTemp(Ity_I32); IRTemp f64lo = newTemp(Ity_F64); + IRTemp f64hi = newTemp(Ity_F64); Bool r2zero = toBool(opc == 0x2C); + do_MMX_preamble(); modrm = getUChar(delta); + if (epartIsReg(modrm)) { delta += 1; assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0)); - DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "", + assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1)); + DIP("cvt%spd2pi %s,%s\n", r2zero ? 
"t" : "", nameXMMReg(eregOfRexRM(pfx,modrm)), - nameIReg(sz, gregOfRexRM(pfx,modrm), - False)); + nameMMXReg(gregLO3ofRM(modrm))); } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); assign(f64lo, loadLE(Ity_F64, mkexpr(addr))); + assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64, + mkexpr(addr), + mkU64(8) ))); delta += alen; - DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "", + DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "", dis_buf, - nameIReg(sz, gregOfRexRM(pfx,modrm), - False)); + nameMMXReg(gregLO3ofRM(modrm))); } if (r2zero) { - assign( rmode, mkU32((UInt)Irrm_ZERO) ); - } else { - assign( rmode, get_sse_roundingmode() ); - } - - if (sz == 4) { - putIReg32( gregOfRexRM(pfx,modrm), - binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) ); - } else { - putIReg64( gregOfRexRM(pfx,modrm), - binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) ); - } - - goto decode_success; - } - /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x - I32 in mmx, according to prevailing SSE rounding mode */ - /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x - I32 in mmx, rounding towards zero */ - if (have66noF2noF3(pfx) && sz == 2) { - IRTemp dst64 = newTemp(Ity_I64); - IRTemp rmode = newTemp(Ity_I32); - IRTemp f64lo = newTemp(Ity_F64); - IRTemp f64hi = newTemp(Ity_F64); - Bool r2zero = toBool(opc == 0x2C); - - do_MMX_preamble(); - modrm = getUChar(delta); - - if (epartIsReg(modrm)) { - delta += 1; - assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0)); - assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1)); - DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "", - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameMMXReg(gregLO3ofRM(modrm))); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign(f64lo, loadLE(Ity_F64, mkexpr(addr))); - assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64, - mkexpr(addr), - mkU64(8) ))); - delta += alen; - DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "", - dis_buf, - nameMMXReg(gregLO3ofRM(modrm))); - } - - if (r2zero) { - assign(rmode, mkU32((UInt)Irrm_ZERO) ); + assign(rmode, mkU32((UInt)Irrm_ZERO) ); } else { assign( rmode, get_sse_roundingmode() ); } @@ -10102,75 +10479,13 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, /* 66 0F 2F = COMISD -- 64F0x2 comparison G,E, and set ZCP */ /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */ if (have66noF2noF3(pfx) && sz == 2) { - IRTemp argL = newTemp(Ity_F64); - IRTemp argR = newTemp(Ity_F64); - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm), - 0/*lowest lane*/ ) ); - delta += 1; - DIP("%scomisd %s,%s\n", opc==0x2E ? "u" : "", - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( argR, loadLE(Ity_F64, mkexpr(addr)) ); - delta += alen; - DIP("%scomisd %s,%s\n", opc==0x2E ? 
"u" : "", - dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } - assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm), - 0/*lowest lane*/ ) ); - - stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); - stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); - stmt( IRStmt_Put( - OFFB_CC_DEP1, - binop( Iop_And64, - unop( Iop_32Uto64, - binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ), - mkU64(0x45) - ))); - + delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc ); goto decode_success; } /* 0F 2F = COMISS -- 32F0x4 comparison G,E, and set ZCP */ /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */ if (haveNo66noF2noF3(pfx) && sz == 4) { - IRTemp argL = newTemp(Ity_F32); - IRTemp argR = newTemp(Ity_F32); - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm), - 0/*lowest lane*/ ) ); - delta += 1; - DIP("%scomiss %s,%s\n", opc==0x2E ? "u" : "", - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( argR, loadLE(Ity_F32, mkexpr(addr)) ); - delta += alen; - DIP("%scomiss %s,%s\n", opc==0x2E ? "u" : "", - dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } - assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm), - 0/*lowest lane*/ ) ); - - stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); - stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); - stmt( IRStmt_Put( - OFFB_CC_DEP1, - binop( Iop_And64, - unop( Iop_32Uto64, - binop(Iop_CmpF64, - unop(Iop_F32toF64,mkexpr(argL)), - unop(Iop_F32toF64,mkexpr(argR)))), - mkU64(0x45) - ))); - + delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc ); goto decode_success; } break; @@ -10366,7 +10681,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 ); goto decode_success; } - /* 0F 57 = XORPS -- G = G and E */ + /* 0F 57 = XORPS -- G = G xor E */ if (haveNo66noF2noF3(pfx) && sz == 4) { delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 ); goto decode_success; @@ -10427,31 +10742,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x F64 in xmm(G). */ if (haveNo66noF2noF3(pfx) && sz == 4) { - IRTemp f32lo = newTemp(Ity_F32); - IRTemp f32hi = newTemp(Ity_F32); - - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0) ); - assign( f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1) ); - delta += 1; - DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) ); - assign( f32hi, loadLE(Ity_F32, - binop(Iop_Add64,mkexpr(addr),mkU64(4))) ); - delta += alen; - DIP("cvtps2pd %s,%s\n", dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } - - putXMMRegLane64F( gregOfRexRM(pfx,modrm), 1, - unop(Iop_F32toF64, mkexpr(f32hi)) ); - putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0, - unop(Iop_F32toF64, mkexpr(f32lo)) ); - + delta = dis_CVTPS2PD( vbi, pfx, delta, False/*!isAvx*/ ); goto decode_success; } /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in @@ -10510,45 +10801,9 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, lo half xmm(G), rounding according to prevailing SSE rounding mode, and zero upper half */ /* Note, this is practically identical to CVTPD2DQ. It would have - been nicer to merge them together, but the insn[] offsets differ - by one. 
*/ + be nice to merge them together. */ if (have66noF2noF3(pfx) && sz == 2) { - IRTemp argV = newTemp(Ity_V128); - IRTemp rmode = newTemp(Ity_I32); - - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) ); - delta += 1; - DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( argV, loadLE(Ity_V128, mkexpr(addr)) ); - delta += alen; - DIP("cvtpd2ps %s,%s\n", dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm)) ); - } - - assign( rmode, get_sse_roundingmode() ); - t0 = newTemp(Ity_F64); - t1 = newTemp(Ity_F64); - assign( t0, unop(Iop_ReinterpI64asF64, - unop(Iop_V128to64, mkexpr(argV))) ); - assign( t1, unop(Iop_ReinterpI64asF64, - unop(Iop_V128HIto64, mkexpr(argV))) ); - -# define CVT(_t) binop( Iop_F64toF32, \ - mkexpr(rmode), \ - mkexpr(_t) ) - - putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) ); - putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) ); - putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) ); - putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) ); - -# undef CVT - + delta = dis_CVTPD2PS( vbi, pfx, delta, False/*!isAvx*/ ); goto decode_success; } break; @@ -10959,39 +11214,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, case 0x70: /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */ if (have66noF2noF3(pfx) && sz == 2) { - Int order; - IRTemp sV, dV, s3, s2, s1, s0; - s3 = s2 = s1 = s0 = IRTemp_INVALID; - sV = newTemp(Ity_V128); - dV = newTemp(Ity_V128); - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); - order = (Int)getUChar(delta+1); - delta += 1+1; - DIP("pshufd $%d,%s,%s\n", order, - nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, - 1/*byte after the amode*/ ); - assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); - order = (Int)getUChar(delta+alen); - delta += alen+1; - DIP("pshufd $%d,%s,%s\n", order, - dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm))); - } - breakup128to32s( sV, &s3, &s2, &s1, &s0 ); - -# define SEL(n) \ - ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? 
s2 : s3))) - assign(dV, - mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3), - SEL((order>>2)&3), SEL((order>>0)&3) ) - ); - putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV)); -# undef SEL + delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/); goto decode_success; } /* ***--- this is an MMX class insn introduced in SSE1 ---*** */ @@ -11176,59 +11399,13 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3) { - IRTemp sV, dV, hi64, lo64, hi64r, lo64r; - Int imm = (Int)getUChar(delta+1); - Int reg = eregOfRexRM(pfx,getUChar(delta)); + Int imm = (Int)getUChar(delta+1); + Int reg = eregOfRexRM(pfx,getUChar(delta)); DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg)); - vassert(imm >= 0 && imm <= 255); delta += 2; - - sV = newTemp(Ity_V128); - dV = newTemp(Ity_V128); - hi64 = newTemp(Ity_I64); - lo64 = newTemp(Ity_I64); - hi64r = newTemp(Ity_I64); - lo64r = newTemp(Ity_I64); - - if (imm >= 16) { - putXMMReg(reg, mkV128(0x0000)); - goto decode_success; - } - + IRTemp sV = newTemp(Ity_V128); assign( sV, getXMMReg(reg) ); - assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) ); - assign( lo64, unop(Iop_V128to64, mkexpr(sV)) ); - - if (imm == 0) { - assign( lo64r, mkexpr(lo64) ); - assign( hi64r, mkexpr(hi64) ); - } - else - if (imm == 8) { - assign( hi64r, mkU64(0) ); - assign( lo64r, mkexpr(hi64) ); - } - else - if (imm > 8) { - assign( hi64r, mkU64(0) ); - assign( lo64r, binop( Iop_Shr64, - mkexpr(hi64), - mkU8( 8*(imm-8) ) )); - } else { - assign( hi64r, binop( Iop_Shr64, - mkexpr(hi64), - mkU8(8 * imm) )); - assign( lo64r, - binop( Iop_Or64, - binop(Iop_Shr64, mkexpr(lo64), - mkU8(8 * imm)), - binop(Iop_Shl64, mkexpr(hi64), - mkU8(8 * (8 - imm)) ) - ) - ); - } - - assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) ); + IRTemp dV = math_PSRLDQ( sV, imm ); putXMMReg(reg, mkexpr(dV)); goto decode_success; } @@ -11620,18 +11797,18 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, d->fxState[4].size = sizeof(ULong); d->fxState[5].fx = Ifx_Read; - d->fxState[5].offset = OFFB_XMM0; + d->fxState[5].offset = OFFB_YMM0; d->fxState[5].size = 16 * sizeof(U128); d->fxState[6].fx = Ifx_Read; d->fxState[6].offset = OFFB_SSEROUND; d->fxState[6].size = sizeof(ULong); - /* Be paranoid ... this assertion tries to ensure the 16 %xmm + /* Be paranoid ... this assertion tries to ensure the 16 %ymm images are packed back-to-back. If not, the value of d->fxState[5].size is wrong. */ - vassert(16 == sizeof(U128)); - vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16)); + vassert(32 == sizeof(U256)); + vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32)); stmt( IRStmt_Dirty(d) ); @@ -11695,18 +11872,18 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, d->fxState[4].size = sizeof(ULong); d->fxState[5].fx = Ifx_Write; - d->fxState[5].offset = OFFB_XMM0; + d->fxState[5].offset = OFFB_YMM0; d->fxState[5].size = 16 * sizeof(U128); d->fxState[6].fx = Ifx_Write; d->fxState[6].offset = OFFB_SSEROUND; d->fxState[6].size = sizeof(ULong); - /* Be paranoid ... this assertion tries to ensure the 16 %xmm + /* Be paranoid ... this assertion tries to ensure the 16 %ymm images are packed back-to-back. If not, the value of d->fxState[5].size is wrong. 
*/ - vassert(16 == sizeof(U128)); - vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16)); + vassert(32 == sizeof(U256)); + vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32)); stmt( IRStmt_Dirty(d) ); @@ -11717,23 +11894,27 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, case 0xC2: /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */ if (haveNo66noF2noF3(pfx) && sz == 4) { - delta = dis_SSEcmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 ); - goto decode_success; + Long delta0 = delta; + delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 ); + if (delta > delta0) goto decode_success; } /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */ if (haveF3no66noF2(pfx) && sz == 4) { - delta = dis_SSEcmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 ); - goto decode_success; + Long delta0 = delta; + delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 ); + if (delta > delta0) goto decode_success; } /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */ if (haveF2no66noF3(pfx) && sz == 4) { - delta = dis_SSEcmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 ); - goto decode_success; + Long delta0 = delta; + delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 ); + if (delta > delta0) goto decode_success; } /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */ if (have66noF2noF3(pfx) && sz == 2) { - delta = dis_SSEcmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 ); - goto decode_success; + Long delta0 = delta; + delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 ); + if (delta > delta0) goto decode_success; } break; @@ -12871,6 +13052,36 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, /*--- ---*/ /*------------------------------------------------------------*/ +static Long dis_MOVDDUP_128 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp sV = newTemp(Ity_V128); + IRTemp d0 = newTemp(Ity_I64); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getXMMReg(rE) ); + DIP("%smovddup %s,%s\n", + isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG)); + delta += 1; + assign ( d0, unop(Iop_V128to64, mkexpr(sV)) ); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( d0, loadLE(Ity_I64, mkexpr(addr)) ); + DIP("%smovddup %s,%s\n", + isAvx ? "v" : "", dis_buf, nameXMMReg(rG)); + delta += alen; + } + (isAvx ? putYMMRegLoAndZU : putXMMReg) + ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) ); + return delta; +} + + __attribute__((noinline)) static Long dis_ESC_0F__SSE3 ( Bool* decode_OK, @@ -12921,26 +13132,7 @@ Long dis_ESC_0F__SSE3 ( Bool* decode_OK, duplicating some lanes (0:1:0:1). 
*/ if (haveF2no66noF3(pfx) && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) { - IRTemp sV = newTemp(Ity_V128); - IRTemp d0 = newTemp(Ity_I64); - - modrm = getUChar(delta); - if (epartIsReg(modrm)) { - assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) ); - DIP("movddup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), - nameXMMReg(gregOfRexRM(pfx,modrm))); - delta += 1; - assign ( d0, unop(Iop_V128to64, mkexpr(sV)) ); - } else { - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( d0, loadLE(Ity_I64, mkexpr(addr)) ); - DIP("movddup %s,%s\n", dis_buf, - nameXMMReg(gregOfRexRM(pfx,modrm))); - delta += alen; - } - - putXMMReg( gregOfRexRM(pfx,modrm), - binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) ); + delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ ); goto decode_success; } break; @@ -13172,6 +13364,97 @@ Long dis_ESC_0F__SSE3 ( Bool* decode_OK, /*--- ---*/ /*------------------------------------------------------------*/ +static +IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ ) +{ + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp sevens = newTemp(Ity_I64); + IRTemp mask0x80hi = newTemp(Ity_I64); + IRTemp mask0x80lo = newTemp(Ity_I64); + IRTemp maskBit3hi = newTemp(Ity_I64); + IRTemp maskBit3lo = newTemp(Ity_I64); + IRTemp sAnd7hi = newTemp(Ity_I64); + IRTemp sAnd7lo = newTemp(Ity_I64); + IRTemp permdHi = newTemp(Ity_I64); + IRTemp permdLo = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_V128); + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + assign( sevens, mkU64(0x0707070707070707ULL) ); + + /* mask0x80hi = Not(SarN8x8(sHi,7)) + maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) + sAnd7hi = And(sHi,sevens) + permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), + And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) + rHi = And(permdHi,mask0x80hi) + */ + assign( + mask0x80hi, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); + + assign( + maskBit3hi, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), + mkU8(7))); + + assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); + + assign( + permdHi, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), + mkexpr(maskBit3hi)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), + unop(Iop_Not64,mkexpr(maskBit3hi))) )); + + assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); + + /* And the same for the lower half of the result. What fun. 
*/ + + assign( + mask0x80lo, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); + + assign( + maskBit3lo, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), + mkU8(7))); + + assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); + + assign( + permdLo, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), + mkexpr(maskBit3lo)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), + unop(Iop_Not64,mkexpr(maskBit3lo))) )); + + assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); + + assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))); + return res; +} + + __attribute__((noinline)) static Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK, @@ -13194,23 +13477,8 @@ Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK, /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */ if (have66noF2noF3(pfx) && (sz == 2 || /*redundant REX.W*/ sz == 8)) { - IRTemp sV = newTemp(Ity_V128); - IRTemp dV = newTemp(Ity_V128); - IRTemp sHi = newTemp(Ity_I64); - IRTemp sLo = newTemp(Ity_I64); - IRTemp dHi = newTemp(Ity_I64); - IRTemp dLo = newTemp(Ity_I64); - IRTemp rHi = newTemp(Ity_I64); - IRTemp rLo = newTemp(Ity_I64); - IRTemp sevens = newTemp(Ity_I64); - IRTemp mask0x80hi = newTemp(Ity_I64); - IRTemp mask0x80lo = newTemp(Ity_I64); - IRTemp maskBit3hi = newTemp(Ity_I64); - IRTemp maskBit3lo = newTemp(Ity_I64); - IRTemp sAnd7hi = newTemp(Ity_I64); - IRTemp sAnd7lo = newTemp(Ity_I64); - IRTemp permdHi = newTemp(Ity_I64); - IRTemp permdLo = newTemp(Ity_I64); + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); modrm = getUChar(delta); assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); @@ -13229,83 +13497,14 @@ Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK, nameXMMReg(gregOfRexRM(pfx,modrm))); } - assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); - assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); - assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); - assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); - - assign( sevens, mkU64(0x0707070707070707ULL) ); - - /* - mask0x80hi = Not(SarN8x8(sHi,7)) - maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) - sAnd7hi = And(sHi,sevens) - permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), - And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) - rHi = And(permdHi,mask0x80hi) - */ - assign( - mask0x80hi, - unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); - - assign( - maskBit3hi, - binop(Iop_SarN8x8, - binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), - mkU8(7))); - - assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); - - assign( - permdHi, - binop( - Iop_Or64, - binop(Iop_And64, - binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), - mkexpr(maskBit3hi)), - binop(Iop_And64, - binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), - unop(Iop_Not64,mkexpr(maskBit3hi))) )); - - assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); - - /* And the same for the lower half of the result. What fun. 
*/ - - assign( - mask0x80lo, - unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); - - assign( - maskBit3lo, - binop(Iop_SarN8x8, - binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), - mkU8(7))); - - assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); - - assign( - permdLo, - binop( - Iop_Or64, - binop(Iop_And64, - binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), - mkexpr(maskBit3lo)), - binop(Iop_And64, - binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), - unop(Iop_Not64,mkexpr(maskBit3lo))) )); - - assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); - - putXMMReg( - gregOfRexRM(pfx,modrm), - binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) - ); - goto decode_success; - } - /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ - if (haveNo66noF2noF3(pfx) && sz == 4) { - IRTemp sV = newTemp(Ity_I64); - IRTemp dV = newTemp(Ity_I64); + IRTemp res = math_PSHUFB_XMM( dV, sV ); + putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res)); + goto decode_success; + } + /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ + if (haveNo66noF2noF3(pfx) && sz == 4) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); modrm = getUChar(delta); do_MMX_preamble(); @@ -14198,6 +14397,100 @@ Long dis_ESC_0F__SSE4 ( Bool* decode_OK, /*--- ---*/ /*------------------------------------------------------------*/ +static IRTemp math_PBLENDVB ( IRTemp vecE, IRTemp vecG, + IRTemp vec0/*controlling mask*/, + UInt gran, IROp opSAR ) +{ + /* The tricky bit is to convert vec0 into a suitable mask, by + copying the most significant bit of each lane into all positions + in the lane. */ + IRTemp sh = newTemp(Ity_I8); + assign(sh, mkU8(8 * gran - 1)); + + IRTemp mask = newTemp(Ity_V128); + assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh))); + + IRTemp notmask = newTemp(Ity_V128); + assign(notmask, unop(Iop_NotV128, mkexpr(mask))); + + IRTemp res = newTemp(Ity_V128); + assign(res, binop(Iop_OrV128, + binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)), + binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)))); + return res; +} + + +static Long dis_PMOVZXBW ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool writesYmm ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + + if ( epartIsReg(modrm) ) { + assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); + delta += 1; + DIP( "pmovzxbw %s,%s\n", + nameXMMReg( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "pmovzxbw %s,%s\n", + dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + IRExpr* res + = binop( Iop_InterleaveLO8x16, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ); + + (writesYmm ? 
putYMMRegLoAndZU : putXMMReg) + ( gregOfRexRM(pfx, modrm), res ); + + return delta; +} + + +static Long dis_PMOVZXWD ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool writesYmm ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + + if ( epartIsReg(modrm) ) { + assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); + delta += 1; + DIP( "pmovzxwd %s,%s\n", + nameXMMReg( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "pmovzxwd %s,%s\n", + dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + IRExpr* res + = binop( Iop_InterleaveLO16x8, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ); + + (writesYmm ? putYMMRegLoAndZU : putXMMReg) + ( gregOfRexRM(pfx, modrm), res ); + + return delta; +} + + __attribute__((noinline)) static Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, @@ -14266,22 +14559,8 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm))); assign(vec0, getXMMReg(0)); - /* Now the tricky bit is to convert vec0 into a suitable mask, - by copying the most significant bit of each lane into all - positions in the lane. */ - IRTemp sh = newTemp(Ity_I8); - assign(sh, mkU8(8 * gran - 1)); - - IRTemp mask = newTemp(Ity_V128); - assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh))); - - IRTemp notmask = newTemp(Ity_V128); - assign(notmask, unop(Iop_NotV128, mkexpr(mask))); - - IRExpr* res = binop(Iop_OrV128, - binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)), - binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))); - putXMMReg(gregOfRexRM(pfx, modrm), res); + IRTemp res = math_PBLENDVB( vecE, vecG, vec0, gran, opSAR ); + putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res)); goto decode_success; } @@ -14715,30 +14994,7 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64 Packed Move with Zero Extend from Byte to Word (XMM) */ if (have66noF2noF3(pfx) && sz == 2) { - - modrm = getUChar(delta); - - IRTemp srcVec = newTemp(Ity_V128); - - if ( epartIsReg(modrm) ) { - assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); - delta += 1; - DIP( "pmovzxbw %s,%s\n", - nameXMMReg( eregOfRexRM(pfx, modrm) ), - nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } else { - addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( srcVec, - unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); - delta += alen; - DIP( "pmovzxbw %s,%s\n", - dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } - - putXMMReg( gregOfRexRM(pfx, modrm), - binop( Iop_InterleaveLO8x16, - IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); - + delta = dis_PMOVZXBW( vbi, pfx, delta, False/*!writesYmm*/); goto decode_success; } break; @@ -14824,31 +15080,7 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64 Packed Move with Zero Extend from Word to DWord (XMM) */ if (have66noF2noF3(pfx) && sz == 2) { - - modrm = getUChar(delta); - - IRTemp srcVec = newTemp(Ity_V128); - - if ( epartIsReg(modrm) ) { - assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); - delta += 1; - DIP( "pmovzxwd %s,%s\n", - nameXMMReg( eregOfRexRM(pfx, modrm) ), - nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } else { - addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); - assign( srcVec, - unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); - delta += alen; - DIP( 
"pmovzxwd %s,%s\n", - dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } - - putXMMReg( gregOfRexRM(pfx, modrm), - binop( Iop_InterleaveLO16x8, - IRExpr_Const( IRConst_V128(0) ), - mkexpr(srcVec) ) ); - + delta = dis_PMOVZXWD( vbi, pfx, delta, False/*!writesYmm*/); goto decode_success; } break; @@ -15114,7 +15346,7 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, regNoR = gregOfRexRM(pfx, modrm); addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); /* alignment check needed ???? */ - stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) )); + stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) )); delta += alen; } @@ -15124,8 +15356,8 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, /* Round up the arguments. Note that this is a kludge -- the use of mkU64 rather than mkIRExpr_HWord implies the assumption that the host's word size is 64-bit. */ - UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); - UInt gstOffR = xmmGuestRegOffset(regNoR); + UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL); + UInt gstOffR = ymmGuestRegOffset(regNoR); IRExpr* opc4 = mkU64(opc); IRExpr* gstOffLe = mkU64(gstOffL); IRExpr* gstOffRe = mkU64(gstOffR); @@ -15252,6 +15484,59 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, /*--- ---*/ /*------------------------------------------------------------*/ +static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + IRTemp t0 = IRTemp_INVALID; + IRTemp t1 = IRTemp_INVALID; + IRTemp t2 = IRTemp_INVALID; + IRTemp t3 = IRTemp_INVALID; + UChar modrm = 0; + Int alen = 0; + HChar dis_buf[50]; + + Int imm8_10; + IRTemp xmm_vec = newTemp(Ity_V128); + IRTemp src_dword = newTemp(Ity_I32); + HChar* mbV = isAvx ? "v" : ""; + + vassert(0==getRexW(pfx)); /* ensured by caller */ + modrm = getUChar(delta); + assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) ); + breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 ); + + if ( epartIsReg( modrm ) ) { + imm8_10 = (Int)(getUChar(delta+1) & 3); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + imm8_10 = (Int)(getUChar(delta+alen) & 3); + } + + switch ( imm8_10 ) { + case 0: assign( src_dword, mkexpr(t0) ); break; + case 1: assign( src_dword, mkexpr(t1) ); break; + case 2: assign( src_dword, mkexpr(t2) ); break; + case 3: assign( src_dword, mkexpr(t3) ); break; + default: vassert(0); + } + + if ( epartIsReg( modrm ) ) { + putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) ); + delta += 1+1; + DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10, + nameXMMReg( gregOfRexRM(pfx, modrm) ), + nameIReg32( eregOfRexRM(pfx, modrm) ) ); + } else { + storeLE( mkexpr(addr), mkexpr(src_dword) ); + delta += alen+1; + DIP( "%spextrd $%d, %s,%s\n", mbV, + imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf ); + } + return delta; +} + + __attribute__((noinline)) static Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, @@ -15716,43 +16001,7 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, here the REX.W bit is _not_ present */ if (have66noF2noF3(pfx) && sz == 2 /* REX.W is _not_ present */) { - - Int imm8_10; - IRTemp xmm_vec = newTemp(Ity_V128); - IRTemp src_dword = newTemp(Ity_I32); - - modrm = getUChar(delta); - assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) ); - breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 ); - - if ( epartIsReg( modrm ) ) { - imm8_10 = (Int)(getUChar(delta+1) & 3); - } else { - addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); - imm8_10 = (Int)(getUChar(delta+alen) & 3); - } - - switch ( imm8_10 ) { - case 0: assign( 
src_dword, mkexpr(t0) ); break; - case 1: assign( src_dword, mkexpr(t1) ); break; - case 2: assign( src_dword, mkexpr(t2) ); break; - case 3: assign( src_dword, mkexpr(t3) ); break; - default: vassert(0); - } - - if ( epartIsReg( modrm ) ) { - putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) ); - delta += 1+1; - DIP( "pextrd $%d, %s,%s\n", imm8_10, - nameXMMReg( gregOfRexRM(pfx, modrm) ), - nameIReg32( eregOfRexRM(pfx, modrm) ) ); - } else { - storeLE( mkexpr(addr), mkexpr(src_dword) ); - delta += alen+1; - DIP( "pextrd $%d, %s,%s\n", - imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf ); - } - + delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ ); goto decode_success; } /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8 @@ -16404,7 +16653,7 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); /* No alignment check; I guess that makes sense, given that these insns are for dealing with C style strings. */ - stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) )); + stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) )); imm = getUChar(delta+alen); delta += alen+1; } @@ -16433,8 +16682,8 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, /* Round up the arguments. Note that this is a kludge -- the use of mkU64 rather than mkIRExpr_HWord implies the assumption that the host's word size is 64-bit. */ - UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); - UInt gstOffR = xmmGuestRegOffset(regNoR); + UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL); + UInt gstOffR = ymmGuestRegOffset(regNoR); IRExpr* opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF)); IRExpr* gstOffLe = mkU64(gstOffL); @@ -16462,7 +16711,7 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, /* Declare that the helper writes XMM0. */ d->nFxState = 3; d->fxState[2].fx = Ifx_Write; - d->fxState[2].offset = xmmGuestRegOffset(0); + d->fxState[2].offset = ymmGuestRegOffset(0); d->fxState[2].size = sizeof(U128); } @@ -16518,7 +16767,7 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, regNoR = gregOfRexRM(pfx, modrm); addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); /* alignment check ???? . */ - stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) )); + stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) )); imm = getUChar(delta+alen); delta += alen+1; } @@ -16530,8 +16779,8 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, /* Round up the arguments. Note that this is a kludge -- the use of mkU64 rather than mkIRExpr_HWord implies the assumption that the host's word size is 64-bit. */ - UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); - UInt gstOffR = xmmGuestRegOffset(regNoR); + UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL); + UInt gstOffR = ymmGuestRegOffset(regNoR); IRExpr* imme = mkU64(imm & 0xFF); IRExpr* gstOffLe = mkU64(gstOffL); @@ -19001,94 +19250,1594 @@ Long dis_ESC_0F3A ( return delta; } - //decode_failure: return deltaIN; /* fail */ } /*------------------------------------------------------------*/ /*--- ---*/ -/*--- Disassemble a single instruction ---*/ +/*--- Top-level post-escape decoders: dis_ESC_0F__VEX ---*/ /*--- ---*/ /*------------------------------------------------------------*/ -/* Disassemble a single instruction into IR. The instruction is - located in host memory at &guest_code[delta]. 
*/ - static -DisResult disInstr_AMD64_WRK ( - /*OUT*/Bool* expect_CAS, - Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), - Bool resteerCisOk, - void* callback_opaque, - Long delta64, - VexArchInfo* archinfo, - VexAbiInfo* vbi - ) +Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG ( + /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, HChar* name, + /* The actual operation. Use either 'op' or 'opfn', + but not both. */ + IROp op, IRTemp(*opFn)(IRTemp,IRTemp), + Bool invertLeftArg + ) { - IRTemp t1, t2, t3, t4, t5, t6; - UChar pre; - Int n, n_prefixes; - DisResult dres; + UChar modrm = getUChar(delta); + UInt rD = gregOfRexRM(pfx, modrm); + UInt rSL = getVexNvvvv(pfx); + IRTemp tSL = newTemp(Ity_V128); + IRTemp tSR = newTemp(Ity_V128); + IRTemp addr = IRTemp_INVALID; + HChar dis_buf[50]; + Int alen = 0; + vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/); - /* The running delta */ - Long delta = delta64; + assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL)) + : getXMMReg(rSL)); - /* Holds eip at the start of the insn, so that we can print - consistent error messages for unimplemented insns. */ - Long delta_start = delta; + if (epartIsReg(modrm)) { + UInt rSR = eregOfRexRM(pfx, modrm); + delta += 1; + assign(tSR, getXMMReg(rSR)); + DIP("%s %s,%s,%s\n", + name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + assign(tSR, loadLE(Ity_V128, mkexpr(addr))); + DIP("%s %s,%s,%s\n", + name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD)); + } - /* sz denotes the nominal data-op size of the insn; we change it to - 2 if an 0x66 prefix is seen and 8 if REX.W is 1. In case of - conflict REX.W takes precedence. */ - Int sz = 4; + IRTemp res = IRTemp_INVALID; + if (op != Iop_INVALID) { + vassert(opFn == NULL); + res = newTemp(Ity_V128); + assign(res, binop(op, mkexpr(tSL), mkexpr(tSR))); + } else { + vassert(opFn != NULL); + res = opFn(tSL, tSR); + } - /* pfx holds the summary of prefixes. */ - Prefix pfx = PFX_EMPTY; + putYMMRegLoAndZU(rD, mkexpr(res)); - /* Set result defaults. */ - dres.whatNext = Dis_Continue; - dres.len = 0; - dres.continueAt = 0; - dres.jk_StopHere = Ijk_INVALID; - *expect_CAS = False; + *uses_vvvv = True; + return delta; +} - vassert(guest_RIP_next_assumed == 0); - vassert(guest_RIP_next_mustcheck == False); - t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID; +/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp + for the operation, and no inversion of the left arg. */ +static +Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple ( + /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, HChar* name, + IROp op + ) +{ + return dis_VEX_NDS_128_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, name, op, NULL, False); +} - DIP("\t0x%llx: ", guest_RIP_bbstart+delta); - /* Spot "Special" instructions (see comment at top of file). */ - { - UChar* code = (UChar*)(guest_code + delta); - /* Spot the 16-byte preamble: - 48C1C703 rolq $3, %rdi - 48C1C70D rolq $13, %rdi - 48C1C73D rolq $61, %rdi - 48C1C733 rolq $51, %rdi - */ - if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7 - && code[ 3] == 0x03 && - code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7 - && code[ 7] == 0x0D && - code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7 - && code[11] == 0x3D && - code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7 - && code[15] == 0x33) { - /* Got a "Special" instruction preamble. Which one is it? 
*/ - if (code[16] == 0x48 && code[17] == 0x87 - && code[18] == 0xDB /* xchgq %rbx,%rbx */) { - /* %RDX = client_request ( %RAX ) */ - DIP("%%rdx = client_request ( %%rax )\n"); - delta += 19; - jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta); - vassert(dres.whatNext == Dis_StopHere); - goto decode_success; - } - else - if (code[16] == 0x48 && code[17] == 0x87 +/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR + generator to compute the result, and no inversion of the left + arg. */ +static +Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex ( + /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, HChar* name, + IRTemp(*opFn)(IRTemp,IRTemp) + ) +{ + return dis_VEX_NDS_128_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, name, Iop_INVALID, opFn, False); +} + + +/* Vector by scalar shift of E into V, by an immediate byte. Modified + version of dis_SSE_shiftE_imm. */ +static +Long dis_AVX128_shiftE_to_V_imm( Prefix pfx, + Long delta, HChar* opname, IROp op ) +{ + Bool shl, shr, sar; + UChar rm = getUChar(delta); + IRTemp e0 = newTemp(Ity_V128); + IRTemp e1 = newTemp(Ity_V128); + UInt rD = getVexNvvvv(pfx); + UChar amt, size; + vassert(epartIsReg(rm)); + vassert(gregLO3ofRM(rm) == 2 + || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6); + amt = getUChar(delta+1); + delta += 2; + DIP("%s $%d,%s,%s\n", opname, + (Int)amt, + nameXMMReg(eregOfRexRM(pfx,rm)), + nameXMMReg(rD)); + assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + //case Iop_ShlN16x8: shl = True; size = 16; break; + case Iop_ShlN32x4: shl = True; size = 32; break; + //case Iop_ShlN64x2: shl = True; size = 64; break; + //case Iop_SarN16x8: sar = True; size = 16; break; + //case Iop_SarN32x4: sar = True; size = 32; break; + //case Iop_ShrN16x8: shr = True; size = 16; break; + //case Iop_ShrN32x4: shr = True; size = 32; break; + //case Iop_ShrN64x2: shr = True; size = 64; break; + default: vassert(0); + } + + if (shl || shr) { + assign( e1, amt >= size + ? mkV128(0x0000) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else + if (sar) { + assign( e1, amt >= size + ? binop(op, mkexpr(e0), mkU8(size-1)) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else { + vassert(0); + } + + putYMMRegLoAndZU( rD, mkexpr(e1) ); + return delta; +} + + +/* Lower 64-bit lane only AVX128 binary operation: + G[63:0] = V[63:0] `op` E[63:0] + G[127:64] = V[127:64] + G[255:128] = 0. + The specified op must be of the 64F0x2 kind, so that it + copies the upper half of the left operand to the result. +*/ +static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen; + IRTemp addr; + UChar rm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,rm); + UInt rV = getVexNvvvv(pfx); + IRExpr* vpart = getXMMReg(rV); + if (epartIsReg(rm)) { + UInt rE = eregOfRexRM(pfx,rm); + putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) ); + DIP("%s %s,%s,%s\n", opname, + nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + delta = delta+1; + } else { + /* We can only do a 64-bit memory read, so the upper half of the + E operand needs to be made simply of zeroes. 
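         That is harmless here, since for a 64F0x2 binop the 128-bit
         result is, in effect,

            res[63:0]   = op( vpart[63:0], epart[63:0] )
            res[127:64] = vpart[127:64]

         so the zeroes placed in epart[127:64] are never observed.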
*/ + IRTemp epart = newTemp(Ity_V128); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( epart, unop( Iop_64UtoV128, + loadLE(Ity_I64, mkexpr(addr))) ); + putXMMReg( rG, binop(op, vpart, mkexpr(epart)) ); + DIP("%s %s,%s,%s\n", opname, + dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + delta = delta+alen; + } + putYMMRegLane128( rG, 1, mkV128(0) ); + *uses_vvvv = True; + return delta; +} + + +/* Lower 64-bit lane only AVX128 unary operation: + G[63:0] = op(E[63:0]) + G[127:64] = V[127:64] + G[255:128] = 0 + The specified op must be of the 64F0x2 kind, so that it + copies the upper half of the operand to the result. +*/ +static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen; + IRTemp addr; + UChar rm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,rm); + UInt rV = getVexNvvvv(pfx); + IRTemp e64 = newTemp(Ity_I64); + + /* Fetch E[63:0] */ + if (epartIsReg(rm)) { + UInt rE = eregOfRexRM(pfx,rm); + assign(e64, getXMMRegLane64(rE, 0)); + DIP("%s %s,%s,%s\n", opname, + nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign(e64, loadLE(Ity_I64, mkexpr(addr))); + DIP("%s %s,%s,%s\n", opname, + dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + delta += alen; + } + + /* Create a value 'arg' as V[127:64]++E[63:0] */ + IRTemp arg = newTemp(Ity_V128); + assign(arg, + binop(Iop_SetV128lo64, + getXMMReg(rV), mkexpr(e64))); + /* and apply op to it */ + putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) ); + *uses_vvvv = True; + return delta; +} + + +/* Lower 32-bit lane only AVX128 binary operation: + G[31:0] = V[31:0] `op` E[31:0] + G[127:32] = V[127:32] + G[255:128] = 0. + The specified op must be of the 32F0x4 kind, so that it + copies the upper 3/4 of the left operand to the result. +*/ +static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen; + IRTemp addr; + UChar rm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,rm); + UInt rV = getVexNvvvv(pfx); + IRExpr* vpart = getXMMReg(rV); + if (epartIsReg(rm)) { + UInt rE = eregOfRexRM(pfx,rm); + putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) ); + DIP("%s %s,%s,%s\n", opname, + nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + delta = delta+1; + } else { + /* We can only do a 32-bit memory read, so the upper 3/4 of the + E operand needs to be made simply of zeroes. */ + IRTemp epart = newTemp(Ity_V128); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( epart, unop( Iop_32UtoV128, + loadLE(Ity_I32, mkexpr(addr))) ); + putXMMReg( rG, binop(op, vpart, mkexpr(epart)) ); + DIP("%s %s,%s,%s\n", opname, + dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + delta = delta+alen; + } + putYMMRegLane128( rG, 1, mkV128(0) ); + *uses_vvvv = True; + return delta; +} + + +/* Handles AVX128 32F/64F comparisons. A derivative of + dis_SSEcmp_E_to_G. It can fail, in which case it returns the + original delta to indicate failure. 
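   Callers should therefore check the returned value, along the
   lines of

      Long delta0 = delta;
      delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                       "vcmpsd", False, 8 );
      if (delta > delta0) goto decode_success;
      otherwise decoding failed; fall through and try other
      candidate decodings.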
*/ +static +Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + HChar* opname, Bool all_lanes, Int sz ) +{ + Long deltaIN = delta; + HChar dis_buf[50]; + Int alen; + UInt imm8; + IRTemp addr; + Bool preSwap = False; + IROp op = Iop_INVALID; + Bool postNot = False; + IRTemp plain = newTemp(Ity_V128); + UChar rm = getUChar(delta); + UShort mask = 0; + vassert(sz == 4 || sz == 8); + UInt rG = gregOfRexRM(pfx, rm); + UInt rV = getVexNvvvv(pfx); + IRExpr *argL = NULL, *argR = NULL; + if (epartIsReg(rm)) { + imm8 = getUChar(delta+1); + Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz); + if (!ok) return deltaIN; /* FAIL */ + UInt rE = eregOfRexRM(pfx,rm); + argL = getXMMReg(rV); + argR = getXMMReg(rE); + delta += 1+1; + DIP("%s $%d,%s,%s,%s\n", + opname, (Int)imm8, + nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 ); + imm8 = getUChar(delta+alen); + Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz); + if (!ok) return deltaIN; /* FAIL */ + argL = getXMMReg(rV); + argR = all_lanes ? loadLE(Ity_V128, mkexpr(addr)) + : sz == 8 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr))) + : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))); + delta += alen+1; + DIP("%s $%d,%s,%s,%s\n", + opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + } + + assign(plain, + preSwap ? binop(op, argR, argL) : binop(op, argL, argR)); + + /* FIXME AVX: in the case where we need a preSwap == True and + !all_lanes, I am not sure if this is correct or not. */ + + if (postNot && all_lanes) { + putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) ); + } + else + if (postNot && !all_lanes) { + mask = toUShort(sz==4 ? 0x000F : 0x00FF); + putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) ); + } + else { + putYMMRegLoAndZU( rG, mkexpr(plain) ); + } + + *uses_vvvv = True; + return delta; +} + + +__attribute__((noinline)) +static +Long dis_ESC_0F__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x10: + /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ + /* Move 64 bits from E (mem only) to G (lo half xmm). + Bits 255-64 of the dest are zeroed out. */ + if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); + IRTemp z128 = newTemp(Ity_V128); + assign(z128, mkV128(0)); + putXMMReg( rG, mkexpr(z128) ); + /* FIXME: ALIGNMENT CHECK? */ + putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) ); + putYMMRegLane128( rG, 1, mkexpr(z128) ); + DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG)); + delta += alen; + goto decode_success; + } + /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */ + /* Move 32 bits from E (mem only) to G (lo half xmm). + Bits 255-32 of the dest are zeroed out. 
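      For example, vmovss (%rax),%xmm3 leaves the low 32 bits of
      %xmm3 holding the loaded value, the rest of %xmm3 zero, and
      %ymm3[255:128] zero.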
*/ + if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); + IRTemp z128 = newTemp(Ity_V128); + assign(z128, mkV128(0)); + putXMMReg( rG, mkexpr(z128) ); + /* FIXME: ALIGNMENT CHECK? */ + putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) ); + putYMMRegLane128( rG, 1, mkexpr(z128) ); + DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG)); + delta += alen; + goto decode_success; + } + break; + + case 0x11: + /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */ + /* Move 64 bits from G (low half xmm) to mem only. */ + if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); + /* FIXME: ALIGNMENT CHECK? */ + storeLE( mkexpr(addr), getXMMRegLane64(rG, 0)); + DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf); + delta += alen; + goto decode_success; + } + /* VMOVSS xmm1, m64 = VEX.LIG.F3.0F.WIG 11 /r */ + /* Move 32 bits from G (low 1/4 xmm) to mem only. */ + if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); + /* FIXME: ALIGNMENT CHECK? */ + storeLE( mkexpr(addr), getXMMRegLane32(rG, 0)); + DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf); + delta += alen; + goto decode_success; + } + /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMRegLoAndZU( rE, getXMMReg(rG) ); + DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + storeLE( mkexpr(addr), getXMMReg(rG) ); + DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf); + delta += alen; + } + goto decode_success; + } + break; + + case 0x12: + /* MOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG /12 r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ ); + goto decode_success; + } + break; + + case 0x14: + /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + Bool hi = opc == 0x15; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp eV = newTemp(Ity_V128); + IRTemp vV = newTemp(Ity_V128); + assign( vV, getXMMReg(rV) ); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( eV, getXMMReg(rE) ); + delta += 1; + DIP("vunpck%sps %s,%s\n", hi ? "h" : "l", + nameXMMReg(rE), nameXMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( eV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += alen; + DIP("vunpck%sps %s,%s\n", hi ? 
"h" : "l", + dis_buf, nameXMMReg(rG)); + } + IRTemp res = math_UNPCKxPS_128( eV, vV, opc ); + putYMMRegLoAndZU( rG, mkexpr(res) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x28: + /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMRegLoAndZU( rG, getXMMReg( rE )); + DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) ); + DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG)); + delta += alen; + } + goto decode_success; + } + /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMReg( rG, getYMMReg( rE )); + DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + gen_SEGV_if_not_32_aligned( addr ); + putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) ); + DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG)); + delta += alen; + } + goto decode_success; + } + /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMRegLoAndZU( rG, getXMMReg( rE )); + DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) ); + DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG)); + delta += alen; + } + goto decode_success; + } + break; + + case 0x29: + /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMRegLoAndZU( rE, getXMMReg(rG) ); + DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE)); + delta += 1; + goto decode_success; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + storeLE( mkexpr(addr), getXMMReg(rG) ); + DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf ); + delta += alen; + goto decode_success; + } + } + /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putYMMRegLoAndZU( rE, getXMMReg(rG) ); + DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + storeLE( mkexpr(addr), getXMMReg(rG) ); + DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf ); + delta += alen; + } + goto decode_success; + } + break; + + case 0x2A: { + IRTemp rmode = newTemp(Ity_I32); + assign( rmode, get_sse_roundingmode() ); + /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */ + if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + UChar 
modrm = getUChar(delta); + UInt rV = getVexNvvvv(pfx); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp arg32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx,modrm); + assign( arg32, getIReg32(rS) ); + delta += 1; + DIP("vcvtsi2sdl %s,%s,%s\n", + nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( arg32, loadLE(Ity_I32, mkexpr(addr)) ); + delta += alen; + DIP("vcvtsi2sdl %s,%s,%s\n", + dis_buf, nameXMMReg(rV), nameXMMReg(rD)); + } + putXMMRegLane64F( rD, 0, + unop(Iop_I32StoF64, mkexpr(arg32))); + putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 )); + putYMMRegLane128( rD, 1, mkV128(0) ); + *uses_vvvv = True; + goto decode_success; + } + /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */ + if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + UChar modrm = getUChar(delta); + UInt rV = getVexNvvvv(pfx); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp arg64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx,modrm); + assign( arg64, getIReg64(rS) ); + delta += 1; + DIP("vcvtsi2sdq %s,%s,%s\n", + nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( arg64, loadLE(Ity_I64, mkexpr(addr)) ); + delta += alen; + DIP("vcvtsi2sdq %s,%s,%s\n", + dis_buf, nameXMMReg(rV), nameXMMReg(rD)); + } + putXMMRegLane64F( rD, 0, + binop( Iop_I64StoF64, + get_sse_roundingmode(), + mkexpr(arg64)) ); + putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 )); + putYMMRegLane128( rD, 1, mkV128(0) ); + *uses_vvvv = True; + goto decode_success; + } + /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */ + if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) { + UChar modrm = getUChar(delta); + UInt rV = getVexNvvvv(pfx); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp arg64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx,modrm); + assign( arg64, getIReg64(rS) ); + delta += 1; + DIP("vcvtsi2ssq %s,%s,%s\n", + nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( arg64, loadLE(Ity_I64, mkexpr(addr)) ); + delta += alen; + DIP("vcvtsi2ssq %s,%s,%s\n", + dis_buf, nameXMMReg(rV), nameXMMReg(rD)); + } + putXMMRegLane32F( rD, 0, + binop(Iop_F64toF32, + mkexpr(rmode), + binop(Iop_I64StoF64, mkexpr(rmode), + mkexpr(arg64)) ) ); + putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 )); + putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 )); + putYMMRegLane128( rD, 1, mkV128(0) ); + *uses_vvvv = True; + goto decode_success; + } + /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */ + if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rV = getVexNvvvv(pfx); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp arg32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx,modrm); + assign( arg32, getIReg32(rS) ); + delta += 1; + DIP("vcvtsi2ssl %s,%s,%s\n", + nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( arg32, loadLE(Ity_I32, mkexpr(addr)) ); + delta += alen; + DIP("vcvtsi2ssl %s,%s,%s\n", + dis_buf, nameXMMReg(rV), nameXMMReg(rD)); + } + putXMMRegLane32F( rD, 0, + binop(Iop_F64toF32, + mkexpr(rmode), + unop(Iop_I32StoF64, mkexpr(arg32)) ) ); + putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 )); + putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 )); + putYMMRegLane128( rD, 1, mkV128(0) ); + *uses_vvvv = 
True; + goto decode_success; + } + break; + } + + case 0x2C: + /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */ + if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4); + goto decode_success; + } + /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */ + if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8); + goto decode_success; + } + break; + + case 0x2E: + /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */ + if (have66noF2noF3(pfx)) { + delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc ); + goto decode_success; + } + /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */ + if (haveNo66noF2noF3(pfx)) { + delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc ); + goto decode_success; + } + break; + + case 0x51: + /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64_unary( + uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 ); + goto decode_success; + } + break; + + case 0x54: + /* VANDPD r/m, rV, r ::: r = rV & r/m (MVR format) */ + /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 ); + goto decode_success; + } + /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 ); + goto decode_success; + } + break; + + case 0x55: + /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m (MVR format) */ + /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128, + NULL, True/*invertLeftArg*/ ); + goto decode_success; + } + /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128, + NULL, True/*invertLeftArg*/ ); + goto decode_success; + } + break; + + case 0x56: + /* VORPD r/m, rV, r ::: r = rV | r/m (MVR format) */ + /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 ); + goto decode_success; + } + /* VORPS r/m, rV, r ::: r = rV | r/m (MVR format) */ + /* VORPS = VEX.NDS.128.0F.WIG 56 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 ); + goto decode_success; + } + break; + + case 0x57: + /* VXORPD r/m, rV, r ::: r = rV ^ r/m (MVR format) */ + /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 ); + goto decode_success; + } + /* VXORPS r/m, rV, r ::: r = rV ^ r/m (MVR format) */ + /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 ); + goto decode_success; + } + break; + + case 0x58: + /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */ + if 
(haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 ); + goto decode_success; + } + /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 ); + goto decode_success; + } + break; + + case 0x59: + /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 ); + goto decode_success; + } + /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 ); + goto decode_success; + } + break; + + case 0x5A: + /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */ + if (haveNo66noF2noF3(pfx) && sz == 4) { + delta = dis_CVTPS2PD( vbi, pfx, delta, True/*isAvx*/ ); + goto decode_success; + } + /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */ + if (have66noF2noF3(pfx) && sz == 2) { + delta = dis_CVTPD2PS( vbi, pfx, delta, False/*!isAvx*/ ); + goto decode_success; + } + break; + + case 0x5C: + /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 ); + goto decode_success; + } + /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 ); + goto decode_success; + } + break; + + case 0x5D: + /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 ); + goto decode_success; + } + /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 ); + goto decode_success; + } + break; + + case 0x5E: + /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 ); + goto decode_success; + } + /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 ); + goto decode_success; + } + break; + + case 0x5F: + /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */ + if (haveF2no66noF3(pfx)) { + delta = dis_AVX128_E_V_to_G_lo64( + uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 ); + goto decode_success; + } + /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */ + if (haveF3no66noF2(pfx)) { + delta = dis_AVX128_E_V_to_G_lo32( + uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 ); + goto decode_success; + } + break; + + case 0x6E: + /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + vassert(sz == 2); /* even tho we are transferring 4, not 2. 
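                                  sz is 2 here simply because the
                                  mandatory 66 prefix sets the nominal
                                  operand size; the opcode itself
                                  fixes the transfer width at 32 bits.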
*/ + UChar modrm = getUChar(delta); + if (epartIsReg(modrm)) { + delta += 1; + putYMMRegLoAndZU( + gregOfRexRM(pfx,modrm), + unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) ) + ); + DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + putYMMRegLoAndZU( + gregOfRexRM(pfx,modrm), + unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr))) + ); + DIP("vmovd %s, %s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + goto decode_success; + } + break; + + case 0x6F: + /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */ + /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */ + if ((have66noF2noF3(pfx) /* ATC || haveF3no66noF2(pfx)*/) + && 1==getVexL(pfx)/*256*/) { + UChar modrm = getUChar(delta); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp tD = newTemp(Ity_V256); + Bool isA = have66noF2noF3(pfx); + UChar ch = isA ? 'a' : 'u'; + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx, modrm); + delta += 1; + assign(tD, getYMMReg(rS)); + DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + if (isA) + gen_SEGV_if_not_32_aligned(addr); + assign(tD, loadLE(Ity_V256, mkexpr(addr))); + DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD)); + } + putYMMReg(rD, mkexpr(tD)); + goto decode_success; + } + /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */ + /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */ + if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx)) + && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp tD = newTemp(Ity_V128); + Bool isA = have66noF2noF3(pfx); + UChar ch = isA ? 
'a' : 'u'; + if (epartIsReg(modrm)) { + UInt rS = eregOfRexRM(pfx, modrm); + delta += 1; + assign(tD, getXMMReg(rS)); + DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + if (isA) + gen_SEGV_if_not_16_aligned(addr); + assign(tD, loadLE(Ity_V128, mkexpr(addr))); + DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD)); + } + putYMMRegLoAndZU(rD, mkexpr(tD)); + goto decode_success; + } + break; + + case 0x70: + /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/); + goto decode_success; + } + break; + + case 0x72: + /* VPSLLD imm8, xmm2, xmm1 = VEX.128.66.0F.WIG 72 /6 ib */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ + && epartIsReg(getUChar(delta)) + && gregLO3ofRM(getUChar(delta)) == 6) { + delta = dis_AVX128_shiftE_to_V_imm( pfx, delta, + "vpslld", Iop_ShlN32x4 ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x73: + /* VPSRLDQ VEX.NDD.128.66.0F.WIG 73 /3 ib */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && epartIsReg(getUChar(delta)) + && gregLO3ofRM(getUChar(delta)) == 3) { + Int rS = eregOfRexRM(pfx,getUChar(delta)); + Int rD = getVexNvvvv(pfx); + Int imm = (Int)getUChar(delta+1); + DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD)); + delta += 2; + IRTemp vecS = newTemp(Ity_V128); + assign( vecS, getXMMReg(rS) ); + IRTemp vecD = math_PSRLDQ( vecS, imm ); + putYMMRegLoAndZU(rD, mkexpr(vecD)); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x76: + /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m (MVR format) */ + /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 ); + goto decode_success; + } + break; + + case 0x77: + /* VZEROUPPER = VEX.128.0F.WIG 77 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + Int i; + IRTemp zero128 = newTemp(Ity_V128); + assign(zero128, mkV128(0)); + for (i = 0; i < 16; i++) { + putYMMRegLane128(i, 1, mkexpr(zero128)); + } + DIP("vzeroupper\n"); + goto decode_success; + } + break; + + case 0x7E: + /* Note the Intel docs don't make sense for this. I think they + are wrong. They seem to imply it is a store when in fact I + think it is a load. Also it's unclear whether this is W0, W1 + or WIG. */ + /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 */ + if (haveF3no66noF2(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + vassert(sz == 4); /* even tho we are transferring 8, not 4. 
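                                  sz keeps its default value of 4,
                                  since neither 66 nor REX.W is in
                                  play (W0 is required above); the
                                  opcode itself fixes the transfer
                                  width at 64 bits.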
*/ + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 )); + DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) ); + DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG)); + delta += alen; + } + /* zero bits 255:64 */ + putXMMRegLane64( rG, 1, mkU64(0) ); + putYMMRegLane128( rG, 1, mkV128(0) ); + goto decode_success; + } + break; + + case 0x7F: + /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + UChar modrm = getUChar(delta); + UInt rS = gregOfRexRM(pfx, modrm); + IRTemp tS = newTemp(Ity_V256); + assign(tS, getYMMReg(rS)); + if (epartIsReg(modrm)) { + UInt rD = eregOfRexRM(pfx, modrm); + delta += 1; + putYMMReg(rD, mkexpr(tS)); + DIP("vmovdqa %s,%s\n", nameYMMReg(rS), nameYMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + gen_SEGV_if_not_32_aligned(addr); + storeLE(mkexpr(addr), mkexpr(tS)); + DIP("vmovdqa %s,%s\n", nameYMMReg(rS), dis_buf); + } + goto decode_success; + } + /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */ + /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */ + if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx)) + && 0==getVexL(pfx)/*128*/) { + UChar modrm = getUChar(delta); + UInt rS = gregOfRexRM(pfx, modrm); + IRTemp tS = newTemp(Ity_V128); + Bool isA = have66noF2noF3(pfx); + UChar ch = isA ? 'a' : 'u'; + assign(tS, getXMMReg(rS)); + if (epartIsReg(modrm)) { + UInt rD = eregOfRexRM(pfx, modrm); + delta += 1; + putYMMRegLoAndZU(rD, mkexpr(tS)); + DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + if (isA) + gen_SEGV_if_not_16_aligned(addr); + storeLE(mkexpr(addr), mkexpr(tS)); + DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf); + } + goto decode_success; + } + break; + + case 0xC2: + /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */ + /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */ + if (haveF2no66noF3(pfx)) { + Long delta0 = delta; + delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta, + "vcmpsd", False/*!all_lanes*/, + 8/*sz*/); + if (delta > delta0) goto decode_success; + /* else fall through -- decoding has failed */ + } + /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */ + /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */ + if (haveF3no66noF2(pfx)) { + Long delta0 = delta; + delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta, + "vcmpss", False/*!all_lanes*/, + 4/*sz*/); + if (delta > delta0) goto decode_success; + /* else fall through -- decoding has failed */ + } + break; + + case 0xD6: + /* I can't even find any Intel docs for this one. */ + /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half + xmm) to E (mem or lo half xmm). Looks like L==0(128), W==0 + (WIG, maybe?) 
*/ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*this might be redundant, dunno*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + /* fall through, awaiting test case */ + /* dst: lo half copied, hi half zeroed */ + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 )); + DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf ); + delta += alen; + goto decode_success; + } + } + break; + + case 0xEB: + /* VPOR r/m, rV, r ::: r = rV | r/m (MVR format) */ + /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 ); + goto decode_success; + } + break; + + case 0xEF: + /* VPXOR r/m, rV, r ::: r = rV ^ r/m (MVR format) */ + /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 ); + goto decode_success; + } + break; + + case 0xF8: + /* VPSUBB r/m, rV, r ::: r = rV - r/m (MVR format) */ + /* VPSUBB = VEX.NDS.128.66.0F.WIG EF /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 ); + goto decode_success; + } + break; + + case 0xFA: + /* VPSUBD r/m, rV, r ::: r = rV - r/m (MVR format) */ + /* VPSUBD = VEX.NDS.128.66.0F.WIG FE /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 ); + goto decode_success; + } + break; + + case 0xFE: + /* VPADDD r/m, rV, r ::: r = rV + r/m (MVR format) */ + /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 ); + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + return deltaIN; + + decode_success: + return delta; +} + + +/*------------------------------------------------------------*/ +/*--- ---*/ +/*--- Top-level post-escape decoders: dis_ESC_0F38__VEX ---*/ +/*--- ---*/ +/*------------------------------------------------------------*/ + +__attribute__((noinline)) +static +Long dis_ESC_0F38__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + //IRTemp addr = IRTemp_INVALID; + //Int alen = 0; + //HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x00: + /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) (MVR format) */ + /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM ); + goto decode_success; + } + break; + + case 0x30: + /* VPMOVZXBW xmm2/m64, xmm1 */ + /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PMOVZXBW( vbi, pfx, delta, True/*writesYmm*/ ); + goto decode_success; + } + break; + + case 0x33: + /* VPMOVZXWD xmm2/m64, xmm1 */ + /* VPMOVZXWD = 
VEX.128.66.0F38.WIG 33 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PMOVZXWD( vbi, pfx, delta, True/*writesYmm*/ ); + goto decode_success; + } + break; + + case 0x39: + /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) (MVR format) */ + /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 ); + goto decode_success; + } + break; + + case 0x3D: + /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) (MVR format) */ + /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 ); + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + return deltaIN; + + decode_success: + return delta; +} + + +/*------------------------------------------------------------*/ +/*--- ---*/ +/*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX ---*/ +/*--- ---*/ +/*------------------------------------------------------------*/ + +__attribute__((noinline)) +static +Long dis_ESC_0F3A__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x16: + /* VPEXTRD imm8, r32/m32, xmm2 */ + /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ ); + goto decode_success; + } + break; + + case 0x18: + /* VINSERTF128 r/m, rV, rD + ::: rD = insertinto(a lane in rV, 128 bits from r/m) (MVR format) */ + /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt ib = 0; + UInt rD = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp t128 = newTemp(Ity_V128); + if (epartIsReg(modrm)) { + UInt rmR = eregOfRexRM(pfx, modrm); + delta += 1; + assign(t128, getXMMReg(rmR)); + ib = getUChar(delta); + DIP("vinsertf128 $%u,%s,%s,%s\n", + ib, nameXMMReg(rmR), nameYMMReg(rV), nameYMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + assign(t128, loadLE(Ity_V128, mkexpr(addr))); + delta += alen; + ib = getUChar(delta); + DIP("vinsertf128 $%u,%s,%s,%s\n", + ib, dis_buf, nameYMMReg(rV), nameYMMReg(rD)); + } + delta++; + putYMMRegLane128(rD, 0, getYMMRegLane128(rV, 0)); + putYMMRegLane128(rD, 1, getYMMRegLane128(rV, 1)); + putYMMRegLane128(rD, ib & 1, mkexpr(t128)); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x19: + /* VEXTRACTF128 rS, r/m + ::: r/m:V128 = a lane of rS:V256 (RM format) */ + /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt ib = 0; + UInt rS = gregOfRexRM(pfx, modrm); + IRTemp t128 = newTemp(Ity_V128); + if (epartIsReg(modrm)) { + UInt rD = eregOfRexRM(pfx, modrm); + delta += 1; + ib = getUChar(delta); + assign(t128, getYMMRegLane128(rS, ib & 1)); + putYMMRegLane128(rD, 0, 
mkexpr(t128)); + DIP("vextractf128 $%u,%s,%s\n", + ib, nameXMMReg(rS), nameYMMReg(rD)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + ib = getUChar(delta); + assign(t128, getYMMRegLane128(rS, ib & 1)); + storeLE(mkexpr(addr), mkexpr(t128)); + DIP("vextractf128 $%u,%s,%s\n", + ib, nameYMMReg(rS), dis_buf); + } + delta++; + /* doesn't use vvvv */ + goto decode_success; + } + break; + + case 0x4C: + /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4 + ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */ + /* VPBLENDVB = VEX.NDS.128.66.0F3A.W0 4C /r /is4 */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + UInt rIS4 = 0xFF; /* invalid */ + IRTemp vecE = newTemp(Ity_V128); + IRTemp vecG = newTemp(Ity_V128); + IRTemp vecV = newTemp(Ity_V128); + IRTemp vecIS4 = newTemp(Ity_V128); + if (epartIsReg(modrm)) { + delta++; + UInt rE = eregOfRexRM(pfx, modrm); + assign(vecE, getXMMReg(rE)); + UChar ib = getUChar(delta); + rIS4 = (ib >> 4) & 0xF; + DIP("vpblendvb %s,%s,%s,%s\n", + nameXMMReg(rIS4), nameXMMReg(rE), + nameXMMReg(rV), nameXMMReg(rG)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + assign(vecE, loadLE(Ity_V128, mkexpr(addr))); + UChar ib = getUChar(delta); + rIS4 = (ib >> 4) & 0xF; + DIP("vpblendvb %s,%s,%s,%s\n", + nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + } + delta++; + assign(vecG, getXMMReg(rG)); + assign(vecV, getXMMReg(rV)); + assign(vecIS4, getXMMReg(rIS4)); + IRTemp res = math_PBLENDVB( vecE, vecV, vecIS4, 1, Iop_SarN8x16 ); + putYMMRegLoAndZU( rG, mkexpr(res) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + return deltaIN; + + decode_success: + return delta; +} + + +/*------------------------------------------------------------*/ +/*--- ---*/ +/*--- Disassemble a single instruction ---*/ +/*--- ---*/ +/*------------------------------------------------------------*/ + +/* Disassemble a single instruction into IR. The instruction is + located in host memory at &guest_code[delta]. */ + +static +DisResult disInstr_AMD64_WRK ( + /*OUT*/Bool* expect_CAS, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + Long delta64, + VexArchInfo* archinfo, + VexAbiInfo* vbi + ) +{ + IRTemp t1, t2, t3, t4, t5, t6; + UChar pre; + Int n, n_prefixes; + DisResult dres; + + /* The running delta */ + Long delta = delta64; + + /* Holds eip at the start of the insn, so that we can print + consistent error messages for unimplemented insns. */ + Long delta_start = delta; + + /* sz denotes the nominal data-op size of the insn; we change it to + 2 if an 0x66 prefix is seen and 8 if REX.W is 1. In case of + conflict REX.W takes precedence. */ + Int sz = 4; + + /* pfx holds the summary of prefixes. */ + Prefix pfx = PFX_EMPTY; + + /* Holds the computed opcode-escape indication. */ + Escape esc = ESC_NONE; + + /* Set result defaults. */ + dres.whatNext = Dis_Continue; + dres.len = 0; + dres.continueAt = 0; + dres.jk_StopHere = Ijk_INVALID; + *expect_CAS = False; + + vassert(guest_RIP_next_assumed == 0); + vassert(guest_RIP_next_mustcheck == False); + + t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID; + + DIP("\t0x%llx: ", guest_RIP_bbstart+delta); + + /* Spot "Special" instructions (see comment at top of file). 
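      As a reminder of where this sequence comes from: client code (via the
      macros in valgrind.h) emits a 16-byte preamble of four rotate-lefts of
      %rdi by 3, 13, 61 and 51 bits.  The rotation amounts sum to 128, so the
      preamble as a whole is a no-op, and it is followed by one marker insn
      that selects the request kind.  A rough sketch of the client side, for
      the client-request case only (the variable names are illustrative, not
      the exact valgrind.h macro):

         __asm__ __volatile__(
            "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"
            "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"
            "xchgq %%rbx, %%rbx"           // marker: %RDX = client_request(%RAX)
            : "=d" (result)
            : "a" (&args[0])
            : "cc", "memory" );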
*/ + { + UChar* code = (UChar*)(guest_code + delta); + /* Spot the 16-byte preamble: + 48C1C703 rolq $3, %rdi + 48C1C70D rolq $13, %rdi + 48C1C73D rolq $61, %rdi + 48C1C733 rolq $51, %rdi + */ + if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7 + && code[ 3] == 0x03 && + code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7 + && code[ 7] == 0x0D && + code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7 + && code[11] == 0x3D && + code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7 + && code[15] == 0x33) { + /* Got a "Special" instruction preamble. Which one is it? */ + if (code[16] == 0x48 && code[17] == 0x87 + && code[18] == 0xDB /* xchgq %rbx,%rbx */) { + /* %RDX = client_request ( %RAX ) */ + DIP("%%rdx = client_request ( %%rax )\n"); + delta += 19; + jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta); + vassert(dres.whatNext == Dis_StopHere); + goto decode_success; + } + else + if (code[16] == 0x48 && code[17] == 0x87 && code[18] == 0xC9 /* xchgq %rcx,%rcx */) { /* %RAX = guest_NRADDR */ DIP("%%rax = guest_NRADDR\n"); @@ -19144,13 +20893,79 @@ DisResult disInstr_AMD64_WRK ( if (pre & (1<<0)) pfx |= PFX_REXB; break; default: - goto not_a_prefix; + goto not_a_legacy_prefix; } n_prefixes++; delta++; } - not_a_prefix: + not_a_legacy_prefix: + /* We've used up all the non-VEX prefixes. Parse and validate a + VEX prefix if that's appropriate. */ + if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) { + /* Used temporarily for holding VEX prefixes. */ + UChar vex0 = getUChar(delta); + if (vex0 == 0xC4) { + /* 3-byte VEX */ + UChar vex1 = getUChar(delta+1); + UChar vex2 = getUChar(delta+2); + delta += 3; + pfx |= PFX_VEX; + /* Snarf contents of byte 1 */ + /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR; + /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX; + /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB; + /* m-mmmm */ + switch (vex1 & 0x1F) { + case 1: esc = ESC_0F; break; + case 2: esc = ESC_0F38; break; + case 3: esc = ESC_0F3A; break; + /* Any other m-mmmm field will #UD */ + default: goto decode_failure; + } + /* Snarf contents of byte 2 */ + /* W */ pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0; + /* ~v3 */ pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3; + /* ~v2 */ pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2; + /* ~v1 */ pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1; + /* ~v0 */ pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0; + /* L */ pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0; + /* pp */ + switch (vex2 & 3) { + case 0: break; + case 1: pfx |= PFX_66; break; + case 2: pfx |= PFX_F3; break; + case 3: pfx |= PFX_F2; break; + default: vassert(0); + } + } + else if (vex0 == 0xC5) { + /* 2-byte VEX */ + UChar vex1 = getUChar(delta+1); + delta += 2; + pfx |= PFX_VEX; + /* Snarf contents of byte 1 */ + /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR; + /* ~v3 */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3; + /* ~v2 */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2; + /* ~v1 */ pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1; + /* ~v0 */ pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0; + /* L */ pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0; + /* pp */ + switch (vex1 & 3) { + case 0: break; + case 1: pfx |= PFX_66; break; + case 2: pfx |= PFX_F3; break; + case 3: pfx |= PFX_F2; break; + default: vassert(0); + } + /* implied: */ + esc = ESC_0F; + } + /* Can't have both VEX and REX */ + if ((pfx & PFX_VEX) && (pfx & PFX_REX)) + goto decode_failure; /* can't have both */ + } /* Dump invalid combinations */ n = 0; @@ -19186,7 +21001,6 @@ DisResult disInstr_AMD64_WRK ( /* Now we should be looking at the primary opcode byte or the leading escapes. 
Check that any LOCK prefix is actually allowed. */ - if (pfx & PFX_LOCK) { if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) { DIP("lock "); @@ -19197,47 +21011,91 @@ DisResult disInstr_AMD64_WRK ( } /* Eat up opcode escape bytes, until we're really looking at the - primary opcode byte. */ - Escape esc = ESC_NONE; - pre = getUChar(delta); - if (pre == 0x0F) { - delta++; + primary opcode byte. But only if there's no VEX present. */ + if (!(pfx & PFX_VEX)) { + vassert(esc == ESC_NONE); pre = getUChar(delta); - switch (pre) { - case 0x38: esc = ESC_0F38; delta++; break; - case 0x3A: esc = ESC_0F3A; delta++; break; - default: esc = ESC_0F; break; + if (pre == 0x0F) { + delta++; + pre = getUChar(delta); + switch (pre) { + case 0x38: esc = ESC_0F38; delta++; break; + case 0x3A: esc = ESC_0F3A; delta++; break; + default: esc = ESC_0F; break; + } } } /* So now we're really really looking at the primary opcode byte. */ Long delta_at_primary_opcode = delta; - switch (esc) { - case ESC_NONE: - delta = dis_ESC_NONE( &dres, expect_CAS, - resteerOkFn, resteerCisOk, callback_opaque, - archinfo, vbi, pfx, sz, delta ); - break; - case ESC_0F: - delta = dis_ESC_0F ( &dres, expect_CAS, - resteerOkFn, resteerCisOk, callback_opaque, - archinfo, vbi, pfx, sz, delta ); - break; - case ESC_0F38: - delta = dis_ESC_0F38( &dres, - resteerOkFn, resteerCisOk, callback_opaque, - archinfo, vbi, pfx, sz, delta ); - break; - case ESC_0F3A: - delta = dis_ESC_0F3A( &dres, - resteerOkFn, resteerCisOk, callback_opaque, - archinfo, vbi, pfx, sz, delta ); - break; - default: - vex_printf("XXX esc = %08x\n", esc); - vassert(0); + + if (!(pfx & PFX_VEX)) { + /* Handle non-VEX prefixed instructions. "Legacy" (non-VEX) SSE + instructions preserve the upper 128 bits of YMM registers; + iow we can simply ignore the presence of the upper halves of + these registers. */ + switch (esc) { + case ESC_NONE: + delta = dis_ESC_NONE( &dres, expect_CAS, + resteerOkFn, resteerCisOk, callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + case ESC_0F: + delta = dis_ESC_0F ( &dres, expect_CAS, + resteerOkFn, resteerCisOk, callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + case ESC_0F38: + delta = dis_ESC_0F38( &dres, + resteerOkFn, resteerCisOk, callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + case ESC_0F3A: + delta = dis_ESC_0F3A( &dres, + resteerOkFn, resteerCisOk, callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + default: + vassert(0); + } + } else { + /* VEX prefixed instruction */ + /* Sloppy Intel wording: "An instruction encoded with a VEX.128 + prefix that loads a YMM register operand ..." zeroes out bits + 128 and above of the register. */ + Bool uses_vvvv = False; + switch (esc) { + case ESC_0F: + delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv, + resteerOkFn, resteerCisOk, + callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + case ESC_0F38: + delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv, + resteerOkFn, resteerCisOk, + callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + case ESC_0F3A: + delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv, + resteerOkFn, resteerCisOk, + callback_opaque, + archinfo, vbi, pfx, sz, delta ); + break; + default: + vex_printf("XXX VEX esc = %08x\n", esc); + break; + } + /* If the insn doesn't use VEX.vvvv then it must be all ones. + Check this. 
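         (Encodings that take no vvvv register operand require the field
         to be 1111b; any other value raises #UD, which we model as a
         decode failure.  Note the field is stored inverted, so the
         all-ones case shows up here as getVexNvvvv(pfx) == 0.)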
*/ + if (!uses_vvvv) { + if (getVexNvvvv(pfx) != 0) + goto decode_failure; + } } + vassert(delta - delta_at_primary_opcode >= 0); vassert(delta - delta_at_primary_opcode < 16/*let's say*/); @@ -19473,6 +21331,19 @@ DisResult disInstr_AMD64_WRK ( (Int)getUChar(delta_start+5), (Int)getUChar(delta_start+6), (Int)getUChar(delta_start+7) ); + vex_printf("vex amd64->IR: REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n", + haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx), + getRexX(pfx), getRexB(pfx)); + vex_printf("vex amd64->IR: VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n", + haveVEX(pfx) ? 1 : 0, getVexL(pfx), + getVexNvvvv(pfx), + esc==ESC_NONE ? "NONE" : + esc==ESC_0F ? "0F" : + esc==ESC_0F38 ? "0F38" : + esc==ESC_0F3A ? "0F3A" : "???"); + vex_printf("vex amd64->IR: PFX.66=%d PFX.F2=%d PFX.F3=%d\n", + have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0, + haveF3(pfx) ? 1 : 0); /* Tell the dispatcher that this insn cannot be decoded, and so has not been executed, and (is currently) the next to be executed. diff --git a/VEX/priv/guest_x86_helpers.c b/VEX/priv/guest_x86_helpers.c index 9f7a8f5c1b..d555c42ee0 100644 --- a/VEX/priv/guest_x86_helpers.c +++ b/VEX/priv/guest_x86_helpers.c @@ -2729,7 +2729,11 @@ void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state ) vex_state->guest_SC_CLASS = 0; vex_state->guest_IP_AT_SYSCALL = 0; - vex_state->padding1 = 0; + Int i; + for (i = 0; i < sizeof(vex_state->padding) + / sizeof(vex_state->padding[0]); i++) { + vex_state->padding[i] = 0; + } } diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index 9a4f3105d7..e2bba42e6a 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -72,6 +72,11 @@ void ppHRegAMD64 ( HReg reg ) vassert(r >= 0 && r < 16); vex_printf("%%xmm%d", r); return; + case HRcVec256: + r = hregNumber(reg); + vassert(r >= 0 && r < 16); + vex_printf("%%ymm%d", r); + return; default: vpanic("ppHRegAMD64"); } @@ -120,7 +125,6 @@ HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); } HReg hregAMD64_XMM0 ( void ) { return mkHReg( 0, HRcVec128, False); } HReg hregAMD64_XMM1 ( void ) { return mkHReg( 1, HRcVec128, False); } -HReg hregAMD64_XMM2 ( void ) { return mkHReg( 2, HRcVec128, False); } HReg hregAMD64_XMM3 ( void ) { return mkHReg( 3, HRcVec128, False); } HReg hregAMD64_XMM4 ( void ) { return mkHReg( 4, HRcVec128, False); } HReg hregAMD64_XMM5 ( void ) { return mkHReg( 5, HRcVec128, False); } @@ -131,9 +135,11 @@ HReg hregAMD64_XMM9 ( void ) { return mkHReg( 9, HRcVec128, False); } HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); } HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); } HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); } -HReg hregAMD64_XMM13 ( void ) { return mkHReg(13, HRcVec128, False); } -HReg hregAMD64_XMM14 ( void ) { return mkHReg(14, HRcVec128, False); } -HReg hregAMD64_XMM15 ( void ) { return mkHReg(15, HRcVec128, False); } + +HReg hregAMD64_YMM2 ( void ) { return mkHReg( 2, HRcVec256, False); } +HReg hregAMD64_YMM13 ( void ) { return mkHReg(13, HRcVec256, False); } +HReg hregAMD64_YMM14 ( void ) { return mkHReg(14, HRcVec256, False); } +HReg hregAMD64_YMM15 ( void ) { return mkHReg(15, HRcVec256, False); } void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr ) @@ -980,6 +986,23 @@ AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) { vassert(order >= 0 && order <= 0xFF); return i; } +AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, + HReg reg, AMD64AMode* addr ) { + AMD64Instr* i = 
LibVEX_Alloc(sizeof(AMD64Instr)); + i->tag = Ain_AvxLdSt; + i->Ain.AvxLdSt.isLoad = isLoad; + i->Ain.AvxLdSt.reg = reg; + i->Ain.AvxLdSt.addr = addr; + return i; +} +AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) { + AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr)); + i->tag = Ain_AvxReRg; + i->Ain.AvxReRg.op = op; + i->Ain.AvxReRg.src = re; + i->Ain.AvxReRg.dst = rg; + return i; +} AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter, AMD64AMode* amFailAddr ) { AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr)); @@ -1275,6 +1298,25 @@ void ppAMD64Instr ( AMD64Instr* i, Bool mode64 ) vex_printf(","); ppHRegAMD64(i->Ain.SseShuf.dst); return; + + case Ain_AvxLdSt: + vex_printf("vmovups "); + if (i->Ain.AvxLdSt.isLoad) { + ppAMD64AMode(i->Ain.AvxLdSt.addr); + vex_printf(","); + ppHRegAMD64(i->Ain.AvxLdSt.reg); + } else { + ppHRegAMD64(i->Ain.AvxLdSt.reg); + vex_printf(","); + ppAMD64AMode(i->Ain.AvxLdSt.addr); + } + return; + case Ain_AvxReRg: + vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op)); + ppHRegAMD64(i->Ain.AvxReRg.src); + vex_printf(","); + ppHRegAMD64(i->Ain.AvxReRg.dst); + return; case Ain_EvCheck: vex_printf("(evCheck) decl "); ppAMD64AMode(i->Ain.EvCheck.amCounter); @@ -1360,7 +1402,7 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 ) /* First off, claim it trashes all the caller-saved regs which fall within the register allocator's jurisdiction. These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11 - and all the xmm registers. + and all the xmm/ymm registers. */ addHRegUse(u, HRmWrite, hregAMD64_RAX()); addHRegUse(u, HRmWrite, hregAMD64_RCX()); @@ -1373,7 +1415,6 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 ) addHRegUse(u, HRmWrite, hregAMD64_R11()); addHRegUse(u, HRmWrite, hregAMD64_XMM0()); addHRegUse(u, HRmWrite, hregAMD64_XMM1()); - addHRegUse(u, HRmWrite, hregAMD64_XMM2()); addHRegUse(u, HRmWrite, hregAMD64_XMM3()); addHRegUse(u, HRmWrite, hregAMD64_XMM4()); addHRegUse(u, HRmWrite, hregAMD64_XMM5()); @@ -1384,9 +1425,10 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 ) addHRegUse(u, HRmWrite, hregAMD64_XMM10()); addHRegUse(u, HRmWrite, hregAMD64_XMM11()); addHRegUse(u, HRmWrite, hregAMD64_XMM12()); - addHRegUse(u, HRmWrite, hregAMD64_XMM13()); - addHRegUse(u, HRmWrite, hregAMD64_XMM14()); - addHRegUse(u, HRmWrite, hregAMD64_XMM15()); + addHRegUse(u, HRmWrite, hregAMD64_YMM2()); + addHRegUse(u, HRmWrite, hregAMD64_YMM13()); + addHRegUse(u, HRmWrite, hregAMD64_YMM14()); + addHRegUse(u, HRmWrite, hregAMD64_YMM15()); /* Now we have to state any parameter-carrying registers which might be read. This depends on the regparmness. */ @@ -1567,6 +1609,24 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 ) addHRegUse(u, HRmRead, i->Ain.SseShuf.src); addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst); return; + case Ain_AvxLdSt: + addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr); + addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead, + i->Ain.AvxLdSt.reg); + return; + case Ain_AvxReRg: + if ( (i->Ain.AvxReRg.op == Asse_XOR + || i->Ain.AvxReRg.op == Asse_CMPEQ32) + && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) { + /* See comments on the case for Ain_SseReRg. */ + addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst); + } else { + addHRegUse(u, HRmRead, i->Ain.AvxReRg.src); + addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV + ? 
HRmWrite : HRmModify, + i->Ain.AvxReRg.dst); + } + return; case Ain_EvCheck: /* We expect both amodes only to mention %rbp, so this is in fact pointless, since %rbp isn't allocatable, but anyway.. */ @@ -1742,6 +1802,14 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) mapReg(m, &i->Ain.SseShuf.src); mapReg(m, &i->Ain.SseShuf.dst); return; + case Ain_AvxLdSt: + mapReg(m, &i->Ain.AvxLdSt.reg); + mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr); + break; + case Ain_AvxReRg: + mapReg(m, &i->Ain.AvxReRg.src); + mapReg(m, &i->Ain.AvxReRg.dst); + return; case Ain_EvCheck: /* We expect both amodes only to mention %rbp, so this is in fact pointless, since %rbp isn't allocatable, but anyway.. */ @@ -1763,25 +1831,34 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) */ Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst ) { - /* Moves between integer regs */ - if (i->tag == Ain_Alu64R) { - if (i->Ain.Alu64R.op != Aalu_MOV) - return False; - if (i->Ain.Alu64R.src->tag != Armi_Reg) - return False; - *src = i->Ain.Alu64R.src->Armi.Reg.reg; - *dst = i->Ain.Alu64R.dst; - return True; - } - /* Moves between vector regs */ - if (i->tag == Ain_SseReRg) { - if (i->Ain.SseReRg.op != Asse_MOV) + switch (i->tag) { + case Ain_Alu64R: + /* Moves between integer regs */ + if (i->Ain.Alu64R.op != Aalu_MOV) + return False; + if (i->Ain.Alu64R.src->tag != Armi_Reg) + return False; + *src = i->Ain.Alu64R.src->Armi.Reg.reg; + *dst = i->Ain.Alu64R.dst; + return True; + case Ain_SseReRg: + /* Moves between SSE regs */ + if (i->Ain.SseReRg.op != Asse_MOV) + return False; + *src = i->Ain.SseReRg.src; + *dst = i->Ain.SseReRg.dst; + return True; + case Ain_AvxReRg: + /* Moves between AVX regs */ + if (i->Ain.AvxReRg.op != Asse_MOV) + return False; + *src = i->Ain.AvxReRg.src; + *dst = i->Ain.AvxReRg.dst; + return True; + default: return False; - *src = i->Ain.SseReRg.src; - *dst = i->Ain.SseReRg.dst; - return True; } - return False; + /*NOTREACHED*/ } @@ -1805,6 +1882,9 @@ void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, case HRcVec128: *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am ); return; + case HRcVec256: + *i1 = AMD64Instr_AvxLdSt ( False/*store*/, rreg, am ); + return; default: ppHRegClass(hregClass(rreg)); vpanic("genSpill_AMD64: unimplemented regclass"); @@ -1827,6 +1907,9 @@ void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, case HRcVec128: *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am ); return; + case HRcVec256: + *i1 = AMD64Instr_AvxLdSt ( True/*load*/, rreg, am ); + return; default: ppHRegClass(hregClass(rreg)); vpanic("genReload_AMD64: unimplemented regclass"); @@ -1884,6 +1967,17 @@ static UInt vreg2ireg ( HReg r ) return mkHReg(n, HRcInt64, False); } +/* Ditto for ymm regs. */ +static UInt dvreg2ireg ( HReg r ) +{ + UInt n; + vassert(hregClass(r) == HRcVec256); + vassert(!hregIsVirtual(r)); + n = hregNumber(r); + vassert(n <= 15); + return mkHReg(n, HRcInt64, False); +} + static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem ) { return toUChar( ((mod & 3) << 6) @@ -2086,6 +2180,83 @@ static UChar rexAMode_R ( HReg greg, HReg ereg ) } +/* Assemble a 2 or 3 byte VEX prefix from parts. rexR, rexX, rexB and + notVvvvv need to be not-ed before packing. mmmmm, rexW, L and pp go + in verbatim. There's no range checking on the bits. 
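   As a quick worked example of the packing (derived from the code below,
   assuming the register and any base/index are all in the low 8, so
   rexR = rexX = rexB = 0): with the fields vexAMode_M supplies, namely
   mmmmm = 1 (0F map), rexW = 0, notVvvv = 0, L = 1 (256 bit), pp = 0,
   the 2-byte form is chosen and we get

      byte0 = 0xC5
      byte1 = ((0^1) << 7) | ((0^0xF) << 3) | (1 << 2) | 0
            = 0x80 | 0x78 | 0x04 = 0xFC

   so together with opcode byte 0x10 or 0x11 this yields C5 FC 10/11 /r,
   which is the expected vmovups ymm load/store encoding used for
   Ain_AvxLdSt.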
*/ +static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB, + UInt mmmmm, UInt rexW, UInt notVvvv, + UInt L, UInt pp ) +{ + UChar byte0 = 0; + UChar byte1 = 0; + UChar byte2 = 0; + if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) { + /* 2 byte encoding is possible. */ + byte0 = 0xC5; + byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3) + | (L << 2) | pp; + } else { + /* 3 byte encoding is needed. */ + byte0 = 0xC4; + byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6) + | ((rexB ^ 1) << 5) | mmmmm; + byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp; + } + return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0); +} + +/* Make up a VEX prefix for a (greg,amode) pair. First byte in bits + 7:0 of result, second in 15:8, third (for a 3 byte prefix) in + 23:16. Has m-mmmm set to indicate a prefix of 0F, pp set to + indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and + vvvv=1111 (unused 3rd reg). */ +static UInt vexAMode_M ( HReg greg, AMD64AMode* am ) +{ + UChar L = 1; /* size = 256 */ + UChar pp = 0; /* no SIMD prefix */ + UChar mmmmm = 1; /* 0F */ + UChar notVvvv = 0; /* unused */ + UChar rexW = 0; + UChar rexR = 0; + UChar rexX = 0; + UChar rexB = 0; + /* Same logic as in rexAMode_M. */ + if (am->tag == Aam_IR) { + rexR = iregBit3(greg); + rexX = 0; /* not relevant */ + rexB = iregBit3(am->Aam.IR.reg); + } + else if (am->tag == Aam_IRRS) { + rexR = iregBit3(greg); + rexX = iregBit3(am->Aam.IRRS.index); + rexB = iregBit3(am->Aam.IRRS.base); + } else { + vassert(0); + } + return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp ); +} + +static UChar* emitVexPrefix ( UChar* p, UInt vex ) +{ + switch (vex & 0xFF) { + case 0xC5: + *p++ = 0xC5; + *p++ = (vex >> 8) & 0xFF; + vassert(0 == (vex >> 16)); + break; + case 0xC4: + *p++ = 0xC4; + *p++ = (vex >> 8) & 0xFF; + *p++ = (vex >> 16) & 0xFF; + vassert(0 == (vex >> 24)); + break; + default: + vassert(0); + } + return p; +} + + /* Emit ffree %st(N) */ static UChar* do_ffree_st ( UChar* p, Int n ) { @@ -3261,6 +3432,15 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = (UChar)(i->Ain.SseShuf.order); goto done; + case Ain_AvxLdSt: { + UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg), + i->Ain.AvxLdSt.addr ); + p = emitVexPrefix(p, vex); + *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11); + p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr); + goto done; + } + case Ain_EvCheck: { /* We generate: (3 bytes) decl 8(%rbp) 8 == offsetof(host_EvC_COUNTER) diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index bc63bd2f7f..5e32dadbca 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -71,7 +71,6 @@ extern HReg hregAMD64_FAKE5 ( void ); extern HReg hregAMD64_XMM0 ( void ); extern HReg hregAMD64_XMM1 ( void ); -extern HReg hregAMD64_XMM2 ( void ); extern HReg hregAMD64_XMM3 ( void ); extern HReg hregAMD64_XMM4 ( void ); extern HReg hregAMD64_XMM5 ( void ); @@ -82,9 +81,11 @@ extern HReg hregAMD64_XMM9 ( void ); extern HReg hregAMD64_XMM10 ( void ); extern HReg hregAMD64_XMM11 ( void ); extern HReg hregAMD64_XMM12 ( void ); -extern HReg hregAMD64_XMM13 ( void ); -extern HReg hregAMD64_XMM14 ( void ); -extern HReg hregAMD64_XMM15 ( void ); + +extern HReg hregAMD64_YMM2 ( void ); +extern HReg hregAMD64_YMM13 ( void ); +extern HReg hregAMD64_YMM14 ( void ); +extern HReg hregAMD64_YMM15 ( void ); /* --------- Condition codes, AMD encoding. 
--------- */ @@ -399,6 +400,9 @@ typedef Ain_SseReRg, /* SSE binary general reg-reg, Re, Rg */ Ain_SseCMov, /* SSE conditional move */ Ain_SseShuf, /* SSE2 shuffle (pshufd) */ + Ain_AvxLdSt, /* AVX load/store 256 bits, + no alignment constraints */ + Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */ Ain_EvCheck, /* Event check */ Ain_ProfInc /* 64-bit profile counter increment */ } @@ -664,6 +668,16 @@ typedef HReg src; HReg dst; } SseShuf; + struct { + Bool isLoad; + HReg reg; + AMD64AMode* addr; + } AvxLdSt; + struct { + AMD64SseOp op; + HReg src; + HReg dst; + } AvxReRg; struct { AMD64AMode* amCounter; AMD64AMode* amFailAddr; @@ -726,6 +740,8 @@ extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ); +extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* ); +extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter, AMD64AMode* amFailAddr ); extern AMD64Instr* AMD64Instr_ProfInc ( void ); diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index cbd30b8e6e..24c0376bf5 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -163,8 +163,8 @@ static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp ) return env->vregmap[tmp]; } -static void lookupIRTemp128 ( HReg* vrHI, HReg* vrLO, - ISelEnv* env, IRTemp tmp ) +static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO, + ISelEnv* env, IRTemp tmp ) { vassert(tmp >= 0); vassert(tmp < env->n_vregmap); @@ -189,13 +189,6 @@ static HReg newVRegI ( ISelEnv* env ) return reg; } -//.. static HReg newVRegF ( ISelEnv* env ) -//.. { -//.. HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/); -//.. env->vreg_ctr++; -//.. return reg; -//.. 
} - static HReg newVRegV ( ISelEnv* env ) { HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/); @@ -203,6 +196,13 @@ static HReg newVRegV ( ISelEnv* env ) return reg; } +static HReg newVRegDV ( ISelEnv* env ) +{ + HReg reg = mkHReg(env->vreg_ctr, HRcVec256, True/*virtual reg*/); + env->vreg_ctr++; + return reg; +} + /*---------------------------------------------------------*/ /*--- ISEL: Forward declarations ---*/ @@ -229,9 +229,9 @@ static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e ); static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e ); static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e ); -static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, +static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ); -static void iselInt128Expr ( HReg* rHi, HReg* rLo, +static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ); static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e ); @@ -246,6 +246,14 @@ static HReg iselFltExpr ( ISelEnv* env, IRExpr* e ); static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ); static HReg iselVecExpr ( ISelEnv* env, IRExpr* e ); +static HReg iselV256Expr_wrk ( ISelEnv* env, IRExpr* e ); +static HReg iselV256Expr ( ISelEnv* env, IRExpr* e ); + +static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); +static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); + /*---------------------------------------------------------*/ /*--- ISEL: Misc helpers ---*/ @@ -308,7 +316,7 @@ static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst ) return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst); } -/* Make a vector reg-reg move. */ +/* Make a vector (128 bit) reg-reg move. */ static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst ) { @@ -317,6 +325,15 @@ static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst ) return AMD64Instr_SseReRg(Asse_MOV, src, dst); } +/* Make a double-vector (256 bit) reg-reg move. */ + +static AMD64Instr* mk_dvMOVsd_RR ( HReg src, HReg dst ) +{ + vassert(hregClass(src) == HRcVec256); + vassert(hregClass(dst) == HRcVec256); + return AMD64Instr_AvxReRg(Asse_MOV, src, dst); +} + /* Advance/retreat %rsp by n. */ static void add_to_rsp ( ISelEnv* env, Int n ) @@ -350,46 +367,6 @@ static void push_uimm64( ISelEnv* env, ULong uimm64 ) } } -//.. /* Given an amode, return one which references 4 bytes further -//.. along. */ -//.. -//.. static X86AMode* advance4 ( X86AMode* am ) -//.. { -//.. X86AMode* am4 = dopyX86AMode(am); -//.. switch (am4->tag) { -//.. case Xam_IRRS: -//.. am4->Xam.IRRS.imm += 4; break; -//.. case Xam_IR: -//.. am4->Xam.IR.imm += 4; break; -//.. default: -//.. vpanic("advance4(x86,host)"); -//.. } -//.. return am4; -//.. } -//.. -//.. -//.. /* Push an arg onto the host stack, in preparation for a call to a -//.. helper function of some kind. Returns the number of 32-bit words -//.. pushed. */ -//.. -//.. static Int pushArg ( ISelEnv* env, IRExpr* arg ) -//.. { -//.. IRType arg_ty = typeOfIRExpr(env->type_env, arg); -//.. if (arg_ty == Ity_I32) { -//.. addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg))); -//.. return 1; -//.. } else -//.. if (arg_ty == Ity_I64) { -//.. HReg rHi, rLo; -//.. iselInt64Expr(&rHi, &rLo, env, arg); -//.. addInstr(env, X86Instr_Push(X86RMI_Reg(rHi))); -//.. addInstr(env, X86Instr_Push(X86RMI_Reg(rLo))); -//.. return 2; -//.. } -//.. ppIRExpr(arg); -//.. vpanic("pushArg(x86): can't handle arg of this type"); -//.. } - /* Used only in doHelperCall. 
If possible, produce a single instruction which computes 'e' into 'dst'. If not possible, return @@ -579,11 +556,11 @@ void doHelperCall ( ISelEnv* env, /* SLOW SCHEME; move via temporaries */ slowscheme: -#if 0 -if (n_args > 0) {for (i = 0; args[i]; i++) { -ppIRExpr(args[i]); vex_printf(" "); } -vex_printf("\n");} -#endif +# if 0 /* debug only */ + if (n_args > 0) {for (i = 0; args[i]; i++) { + ppIRExpr(args[i]); vex_printf(" "); } + vex_printf("\n");} +# endif argreg = 0; if (passBBP) { @@ -819,23 +796,6 @@ static ULong bitmask8_to_bytemask64 ( UShort w8 ) } -//.. /* Round an x87 FPU value to 53-bit-mantissa precision, to be used -//.. after most non-simple FPU operations (simple = +, -, *, / and -//.. sqrt). -//.. -//.. This could be done a lot more efficiently if needed, by loading -//.. zero and adding it to the value to be rounded (fldz ; faddp?). -//.. */ -//.. static void roundToF64 ( ISelEnv* env, HReg reg ) -//.. { -//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP()); -//.. sub_from_esp(env, 8); -//.. addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp)); -//.. addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp)); -//.. add_to_esp(env, 8); -//.. } - - /*---------------------------------------------------------*/ /*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/ /*---------------------------------------------------------*/ @@ -1325,68 +1285,6 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) return dst; } -//.. if (e->Iex.Binop.op == Iop_F64toI32 || e->Iex.Binop.op == Iop_F64toI16) { -//.. Int sz = e->Iex.Binop.op == Iop_F64toI16 ? 2 : 4; -//.. HReg rf = iselDblExpr(env, e->Iex.Binop.arg2); -//.. HReg dst = newVRegI(env); -//.. -//.. /* Used several times ... */ -//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP()); -//.. -//.. /* rf now holds the value to be converted, and rrm holds the -//.. rounding mode value, encoded as per the IRRoundingMode -//.. enum. The first thing to do is set the FPU's rounding -//.. mode accordingly. */ -//.. -//.. /* Create a space for the format conversion. */ -//.. /* subl $4, %esp */ -//.. sub_from_esp(env, 4); -//.. -//.. /* Set host rounding mode */ -//.. set_FPU_rounding_mode( env, e->Iex.Binop.arg1 ); -//.. -//.. /* gistw/l %rf, 0(%esp) */ -//.. addInstr(env, X86Instr_FpLdStI(False/*store*/, sz, rf, zero_esp)); -//.. -//.. if (sz == 2) { -//.. /* movzwl 0(%esp), %dst */ -//.. addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst)); -//.. } else { -//.. /* movl 0(%esp), %dst */ -//.. vassert(sz == 4); -//.. addInstr(env, X86Instr_Alu32R( -//.. Xalu_MOV, X86RMI_Mem(zero_esp), dst)); -//.. } -//.. -//.. /* Restore default FPU rounding. */ -//.. set_FPU_rounding_default( env ); -//.. -//.. /* addl $4, %esp */ -//.. add_to_esp(env, 4); -//.. return dst; -//.. } -//.. -//.. /* C3210 flags following FPU partial remainder (fprem), both -//.. IEEE compliant (PREM1) and non-IEEE compliant (PREM). */ -//.. if (e->Iex.Binop.op == Iop_PRemC3210F64 -//.. || e->Iex.Binop.op == Iop_PRem1C3210F64) { -//.. HReg junk = newVRegF(env); -//.. HReg dst = newVRegI(env); -//.. HReg srcL = iselDblExpr(env, e->Iex.Binop.arg1); -//.. HReg srcR = iselDblExpr(env, e->Iex.Binop.arg2); -//.. addInstr(env, X86Instr_FpBinary( -//.. e->Iex.Binop.op==Iop_PRemC3210F64 -//.. ? Xfp_PREM : Xfp_PREM1, -//.. srcL,srcR,junk -//.. )); -//.. /* The previous pseudo-insn will have left the FPU's C3210 -//.. flags set correctly. So bag them. */ -//.. addInstr(env, X86Instr_FpStSW_AX()); -//.. addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst)); -//.. 
addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst)); -//.. return dst; -//.. } - break; } @@ -1523,16 +1421,6 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst)); return dst; } -//.. case Iop_64HIto32: { -//.. HReg rHi, rLo; -//.. iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); -//.. return rHi; /* and abandon rLo .. poor wee thing :-) */ -//.. } -//.. case Iop_64to32: { -//.. HReg rHi, rLo; -//.. iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); -//.. return rLo; /* similar stupid comment to the above ... */ -//.. } case Iop_16HIto8: case Iop_32HIto16: case Iop_64HIto32: { @@ -1640,16 +1528,42 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) /* V128{HI}to64 */ case Iop_V128HIto64: case Iop_V128to64: { - Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0; HReg dst = newVRegI(env); + Int off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16; + HReg rsp = hregAMD64_RSP(); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); - AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); - AMD64AMode* rspN = AMD64AMode_IR(off, hregAMD64_RSP()); - sub_from_rsp(env, 16); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp0)); + AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); + AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, + 16, vec, m16_rsp)); addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(rspN), dst )); - add_to_rsp(env, 16); + AMD64RMI_Mem(off_rsp), dst )); + return dst; + } + + case Iop_V256to64_0: case Iop_V256to64_1: + case Iop_V256to64_2: case Iop_V256to64_3: { + HReg vHi, vLo, vec; + iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); + /* Do the first part of the selection by deciding which of + the 128 bit registers do look at, and second part using + the same scheme as for V128{HI}to64 above. */ + Int off = 0; + switch (e->Iex.Unop.op) { + case Iop_V256to64_0: vec = vLo; off = -16; break; + case Iop_V256to64_1: vec = vLo; off = -8; break; + case Iop_V256to64_2: vec = vHi; off = -16; break; + case Iop_V256to64_3: vec = vHi; off = -8; break; + default: vassert(0); + } + HReg dst = newVRegI(env); + HReg rsp = hregAMD64_RSP(); + AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); + AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, + 16, vec, m16_rsp)); + addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, + AMD64RMI_Mem(off_rsp), dst )); return dst; } @@ -2388,95 +2302,15 @@ static void iselInt128Expr ( HReg* rHi, HReg* rLo, static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) { -//.. HWord fn = 0; /* helper fn for most SIMD64 stuff */ vassert(e); vassert(typeOfIRExpr(env->type_env,e) == Ity_I128); -//.. /* 64-bit literal */ -//.. if (e->tag == Iex_Const) { -//.. ULong w64 = e->Iex.Const.con->Ico.U64; -//.. UInt wHi = ((UInt)(w64 >> 32)) & 0xFFFFFFFF; -//.. UInt wLo = ((UInt)w64) & 0xFFFFFFFF; -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. vassert(e->Iex.Const.con->tag == Ico_U64); -//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi)); -//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } - /* read 128-bit IRTemp */ if (e->tag == Iex_RdTmp) { - lookupIRTemp128( rHi, rLo, env, e->Iex.RdTmp.tmp); + lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp); return; } -//.. /* 64-bit load */ -//.. if (e->tag == Iex_LDle) { -//.. HReg tLo, tHi; -//.. X86AMode *am0, *am4; -//.. 
vassert(e->Iex.LDle.ty == Ity_I64); -//.. tLo = newVRegI(env); -//.. tHi = newVRegI(env); -//.. am0 = iselIntExpr_AMode(env, e->Iex.LDle.addr); -//.. am4 = advance4(am0); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo )); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi )); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* 64-bit GET */ -//.. if (e->tag == Iex_Get) { -//.. X86AMode* am = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP()); -//.. X86AMode* am4 = advance4(am); -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo )); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi )); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* 64-bit GETI */ -//.. if (e->tag == Iex_GetI) { -//.. X86AMode* am -//.. = genGuestArrayOffset( env, e->Iex.GetI.descr, -//.. e->Iex.GetI.ix, e->Iex.GetI.bias ); -//.. X86AMode* am4 = advance4(am); -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo )); -//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi )); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* 64-bit Mux0X */ -//.. if (e->tag == Iex_Mux0X) { -//.. HReg e0Lo, e0Hi, eXLo, eXHi, r8; -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0); -//.. iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX); -//.. addInstr(env, mk_iMOVsd_RR(eXHi, tHi)); -//.. addInstr(env, mk_iMOVsd_RR(eXLo, tLo)); -//.. r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond); -//.. addInstr(env, X86Instr_Test32(X86RI_Imm(0xFF), X86RM_Reg(r8))); -//.. /* This assumes the first cmov32 doesn't trash the condition -//.. codes, so they are still available for the second cmov32 */ -//.. addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi)); -//.. addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } - /* --------- BINARY ops --------- */ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { @@ -2528,276 +2362,11 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); return; -//.. /* Or64/And64/Xor64 */ -//.. case Iop_Or64: -//.. case Iop_And64: -//.. case Iop_Xor64: { -//.. HReg xLo, xHi, yLo, yHi; -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR -//.. : e->Iex.Binop.op==Iop_And64 ? Xalu_AND -//.. : Xalu_XOR; -//.. iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); -//.. addInstr(env, mk_iMOVsd_RR(xHi, tHi)); -//.. addInstr(env, mk_iMOVsd_RR(xLo, tLo)); -//.. iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2); -//.. addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi)); -//.. addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* Add64/Sub64 */ -//.. case Iop_Add64: -//.. case Iop_Sub64: { -//.. HReg xLo, xHi, yLo, yHi; -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1); -//.. addInstr(env, mk_iMOVsd_RR(xHi, tHi)); -//.. addInstr(env, mk_iMOVsd_RR(xLo, tLo)); -//.. iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2); -//.. if (e->Iex.Binop.op==Iop_Add64) { -//.. addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo)); -//.. 
addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi)); -//.. } else { -//.. addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo)); -//.. addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi)); -//.. } -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* 32HLto64(e1,e2) */ -//.. case Iop_32HLto64: -//.. *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1); -//.. *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); -//.. return; -//.. -//.. /* 64-bit shifts */ -//.. case Iop_Shl64: { -//.. /* We use the same ingenious scheme as gcc. Put the value -//.. to be shifted into %hi:%lo, and the shift amount into -//.. %cl. Then (dsts on right, a la ATT syntax): -//.. -//.. shldl %cl, %lo, %hi -- make %hi be right for the -//.. -- shift amt %cl % 32 -//.. shll %cl, %lo -- make %lo be right for the -//.. -- shift amt %cl % 32 -//.. -//.. Now, if (shift amount % 64) is in the range 32 .. 63, -//.. we have to do a fixup, which puts the result low half -//.. into the result high half, and zeroes the low half: -//.. -//.. testl $32, %ecx -//.. -//.. cmovnz %lo, %hi -//.. movl $0, %tmp -- sigh; need yet another reg -//.. cmovnz %tmp, %lo -//.. */ -//.. HReg rAmt, sHi, sLo, tHi, tLo, tTemp; -//.. tLo = newVRegI(env); -//.. tHi = newVRegI(env); -//.. tTemp = newVRegI(env); -//.. rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2); -//.. iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1); -//.. addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX())); -//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi)); -//.. addInstr(env, mk_iMOVsd_RR(sLo, tLo)); -//.. /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo -//.. and those regs are legitimately modifiable. */ -//.. addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi)); -//.. addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, X86RM_Reg(tLo))); -//.. addInstr(env, X86Instr_Test32(X86RI_Imm(32), -//.. X86RM_Reg(hregX86_ECX()))); -//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi)); -//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp)); -//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. case Iop_Shr64: { -//.. /* We use the same ingenious scheme as gcc. Put the value -//.. to be shifted into %hi:%lo, and the shift amount into -//.. %cl. Then: -//.. -//.. shrdl %cl, %hi, %lo -- make %lo be right for the -//.. -- shift amt %cl % 32 -//.. shrl %cl, %hi -- make %hi be right for the -//.. -- shift amt %cl % 32 -//.. -//.. Now, if (shift amount % 64) is in the range 32 .. 63, -//.. we have to do a fixup, which puts the result high half -//.. into the result low half, and zeroes the high half: -//.. -//.. testl $32, %ecx -//.. -//.. cmovnz %hi, %lo -//.. movl $0, %tmp -- sigh; need yet another reg -//.. cmovnz %tmp, %hi -//.. */ -//.. HReg rAmt, sHi, sLo, tHi, tLo, tTemp; -//.. tLo = newVRegI(env); -//.. tHi = newVRegI(env); -//.. tTemp = newVRegI(env); -//.. rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2); -//.. iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1); -//.. addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX())); -//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi)); -//.. addInstr(env, mk_iMOVsd_RR(sLo, tLo)); -//.. /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo -//.. and those regs are legitimately modifiable. */ -//.. addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo)); -//.. addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, X86RM_Reg(tHi))); -//.. addInstr(env, X86Instr_Test32(X86RI_Imm(32), -//.. X86RM_Reg(hregX86_ECX()))); -//.. 
addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo)); -//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp)); -//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* F64 -> I64 */ -//.. /* Sigh, this is an almost exact copy of the F64 -> I32/I16 -//.. case. Unfortunately I see no easy way to avoid the -//.. duplication. */ -//.. case Iop_F64toI64: { -//.. HReg rf = iselDblExpr(env, e->Iex.Binop.arg2); -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. -//.. /* Used several times ... */ -//.. /* Careful ... this sharing is only safe because -//.. zero_esp/four_esp do not hold any registers which the -//.. register allocator could attempt to swizzle later. */ -//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP()); -//.. X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP()); -//.. -//.. /* rf now holds the value to be converted, and rrm holds -//.. the rounding mode value, encoded as per the -//.. IRRoundingMode enum. The first thing to do is set the -//.. FPU's rounding mode accordingly. */ -//.. -//.. /* Create a space for the format conversion. */ -//.. /* subl $8, %esp */ -//.. sub_from_esp(env, 8); -//.. -//.. /* Set host rounding mode */ -//.. set_FPU_rounding_mode( env, e->Iex.Binop.arg1 ); -//.. -//.. /* gistll %rf, 0(%esp) */ -//.. addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp)); -//.. -//.. /* movl 0(%esp), %dstLo */ -//.. /* movl 4(%esp), %dstHi */ -//.. addInstr(env, X86Instr_Alu32R( -//.. Xalu_MOV, X86RMI_Mem(zero_esp), tLo)); -//.. addInstr(env, X86Instr_Alu32R( -//.. Xalu_MOV, X86RMI_Mem(four_esp), tHi)); -//.. -//.. /* Restore default FPU rounding. */ -//.. set_FPU_rounding_default( env ); -//.. -//.. /* addl $8, %esp */ -//.. add_to_esp(env, 8); -//.. -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. default: break; } } /* if (e->tag == Iex_Binop) */ - -//.. /* --------- UNARY ops --------- */ -//.. if (e->tag == Iex_Unop) { -//.. switch (e->Iex.Unop.op) { -//.. -//.. /* 32Sto64(e) */ -//.. case Iop_32Sto64: { -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); -//.. addInstr(env, mk_iMOVsd_RR(src,tHi)); -//.. addInstr(env, mk_iMOVsd_RR(src,tLo)); -//.. addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tHi))); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* 32Uto64(e) */ -//.. case Iop_32Uto64: { -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); -//.. addInstr(env, mk_iMOVsd_RR(src,tLo)); -//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } - -//.. /* could do better than this, but for now ... */ -//.. case Iop_1Sto64: { -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg); -//.. addInstr(env, X86Instr_Set32(cond,tLo)); -//.. addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, X86RM_Reg(tLo))); -//.. addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tLo))); -//.. addInstr(env, mk_iMOVsd_RR(tLo, tHi)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. /* Not64(e) */ -//.. case Iop_Not64: { -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. HReg sHi, sLo; -//.. iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg); -//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi)); -//.. 
addInstr(env, mk_iMOVsd_RR(sLo, tLo)); -//.. addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tHi))); -//.. addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tLo))); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } -//.. -//.. default: -//.. break; -//.. } -//.. } /* if (e->tag == Iex_Unop) */ -//.. -//.. -//.. /* --------- CCALL --------- */ -//.. if (e->tag == Iex_CCall) { -//.. HReg tLo = newVRegI(env); -//.. HReg tHi = newVRegI(env); -//.. -//.. /* Marshal args, do the call, clear stack. */ -//.. doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args ); -//.. -//.. addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi)); -//.. addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo)); -//.. *rHi = tHi; -//.. *rLo = tLo; -//.. return; -//.. } - ppIRExpr(e); vpanic("iselInt128Expr"); } @@ -3379,8 +2948,6 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } -//.. case Iop_Recip64Fx2: op = Xsse_RCPF; goto do_64Fx2_unary; -//.. case Iop_RSqrt64Fx2: op = Asse_RSQRTF; goto do_64Fx2_unary; case Iop_Sqrt64Fx2: op = Asse_SQRTF; goto do_64Fx2_unary; do_64Fx2_unary: { @@ -3408,8 +2975,6 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } -//.. case Iop_Recip64F0x2: op = Xsse_RCPF; goto do_64F0x2_unary; -//.. case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary; case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary; do_64F0x2_unary: { @@ -3453,6 +3018,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { + /* FIXME: could we generate MOVQ here? */ case Iop_SetV128lo64: { HReg dst = newVRegV(env); HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3464,6 +3030,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } + /* FIXME: could we generate MOVD here? */ case Iop_SetV128lo32: { HReg dst = newVRegV(env); HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3476,13 +3043,16 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) } case Iop_64HLtoV128: { - AMD64AMode* rsp = AMD64AMode_IR(0, hregAMD64_RSP()); + HReg rsp = hregAMD64_RSP(); + AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); + AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); + AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1); + AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2); + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp)); + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp)); HReg dst = newVRegV(env); - /* do this via the stack (easy, convenient, etc) */ - addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg1))); - addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg2))); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp)); - add_to_rsp(env, 16); + /* One store-forwarding stall coming up, oh well :-( */ + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp)); return dst; } @@ -3810,6 +3380,153 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) } +/*---------------------------------------------------------*/ +/*--- ISEL: SIMD (V256) expressions, 256 bit. 
---*/ +/*---------------------------------------------------------*/ + +static HReg iselV256Expr ( ISelEnv* env, IRExpr* e ) +{ + HReg r = iselV256Expr_wrk( env, e ); +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(r) == HRcVec256); + vassert(hregIsVirtual(r)); + return r; +} + + +/* DO NOT CALL THIS DIRECTLY */ +static HReg iselV256Expr_wrk ( ISelEnv* env, IRExpr* e ) +{ + //HWord fn = 0; /* address of helper fn, if required */ + //Bool arg1isEReg = False; + //AMD64SseOp op = Asse_INVALID; + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(e); + vassert(ty == Ity_V256); +#if 0 + if (e->tag == Iex_RdTmp) { + return lookupIRTemp(env, e->Iex.RdTmp.tmp); + } + + if (e->tag == Iex_Get) { + HReg dst = newVRegDV(env); + addInstr(env, AMD64Instr_AvxLdSt( + True/*load*/, + dst, + AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP()) + ) + ); + return dst; + } + + if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { + HReg dst = newVRegDV(env); + AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr); + addInstr(env, AMD64Instr_AvxLdSt( True/*load*/, dst, am )); + return dst; + } +#endif + //avx_fail: + vex_printf("iselV256Expr (amd64, subarch = %s): can't reduce\n", + LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); + ppIRExpr(e); + vpanic("iselV256Expr_wrk"); +} + + +/*---------------------------------------------------------*/ +/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/ +/*---------------------------------------------------------*/ + +static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + iselDVecExpr_wrk( rHi, rLo, env, e ); +# if 0 + vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); +# endif + vassert(hregClass(*rHi) == HRcVec128); + vassert(hregClass(*rLo) == HRcVec128); + vassert(hregIsVirtual(*rHi)); + vassert(hregIsVirtual(*rLo)); +} + + +/* DO NOT CALL THIS DIRECTLY */ +static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + vassert(e); + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_V256); + + /* read 256-bit IRTemp */ + if (e->tag == Iex_RdTmp) { + lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp); + return; + } + + if (e->tag == Iex_Get) { + HReg vHi = newVRegV(env); + HReg vLo = newVRegV(env); + HReg rbp = hregAMD64_RBP(); + AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp); + AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16)); + *rHi = vHi; + *rLo = vLo; + return; + } + + if (e->tag == Iex_Load) { + HReg vHi = newVRegV(env); + HReg vLo = newVRegV(env); + HReg rA = iselIntExpr_R(env, e->Iex.Load.addr); + AMD64AMode* am0 = AMD64AMode_IR(0, rA); + AMD64AMode* am16 = AMD64AMode_IR(16, rA); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16)); + *rHi = vHi; + *rLo = vLo; + return; + } + + if (e->tag == Iex_Qop && e->Iex.Qop.op == Iop_64x4toV256) { + HReg rsp = hregAMD64_RSP(); + HReg vHi = newVRegV(env); + HReg vLo = newVRegV(env); + AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); + AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); + /* arg1 is the most significant (Q3), arg4 the least (Q0) */ + /* Get all the args into regs, before messing with the stack. 
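         (One reason to insist on this: selecting an arg can itself generate
         stack traffic, for example the V128{HI}to64 cases above stage
         through -16(%rsp), so all four iselIntExpr_RI calls must be done
         before we start using -8(%rsp)/-16(%rsp) as a staging area here.)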
*/ + AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.arg1); + AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.arg2); + AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.arg3); + AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.arg4); + /* less significant lane (Q2) at the lower address (-16(rsp)) */ + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp)); + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp)); + /* and then the lower half .. */ + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp)); + addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp)); + *rHi = vHi; + *rLo = vLo; + return; + } + + //avx_fail: + vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n", + LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); + ppIRExpr(e); + vpanic("iselDVecExpr_wrk"); +} + + /*---------------------------------------------------------*/ /*--- ISEL: Statements ---*/ /*---------------------------------------------------------*/ @@ -3865,6 +3582,16 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am)); return; } + if (tyd == Ity_V256) { + HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr); + AMD64AMode* am0 = AMD64AMode_IR(0, rA); + AMD64AMode* am16 = AMD64AMode_IR(16, rA); + HReg vHi, vLo; + iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0)); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16)); + return; + } break; } @@ -3893,13 +3620,6 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) hregAMD64_RBP()))); return; } - if (ty == Ity_V128) { - HReg vec = iselVecExpr(env, stmt->Ist.Put.data); - AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, - hregAMD64_RBP()); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am)); - return; - } if (ty == Ity_F32) { HReg f32 = iselFltExpr(env, stmt->Ist.Put.data); AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP()); @@ -3914,6 +3634,23 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am )); return; } + if (ty == Ity_V128) { + HReg vec = iselVecExpr(env, stmt->Ist.Put.data); + AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, + hregAMD64_RBP()); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am)); + return; + } + if (ty == Ity_V256) { + HReg vHi, vLo; + iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data); + HReg rbp = hregAMD64_RBP(); + AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp); + AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0)); + addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16)); + return; + } break; } @@ -3981,7 +3718,7 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) if (ty == Ity_I128) { HReg rHi, rLo, dstHi, dstLo; iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); - lookupIRTemp128( &dstHi, &dstLo, env, tmp); + lookupIRTempPair( &dstHi, &dstLo, env, tmp); addInstr(env, mk_iMOVsd_RR(rHi,dstHi) ); addInstr(env, mk_iMOVsd_RR(rLo,dstLo) ); return; @@ -4010,6 +3747,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) addInstr(env, mk_vMOVsd_RR(src, dst)); return; } + if (ty == Ity_V256) { + HReg rHi, rLo, dstHi, dstLo; + iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); + lookupIRTempPair( &dstHi, &dstLo, env, tmp); + addInstr(env, 
mk_vMOVsd_RR(rHi,dstHi) );
+ addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
+ return;
+ }
break;
}
@@ -4358,17 +4103,25 @@ HInstrArray* iselSB_AMD64 ( IRSB* bb,
hregHI = hreg = INVALID_HREG;
switch (bb->tyenv->types[i]) {
case Ity_I1:
- case Ity_I8:
- case Ity_I16:
- case Ity_I32:
- case Ity_I64: hreg = mkHReg(j++, HRcInt64, True); break;
- case Ity_I128: hreg = mkHReg(j++, HRcInt64, True);
- hregHI = mkHReg(j++, HRcInt64, True); break;
+ case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
+ hreg = mkHReg(j++, HRcInt64, True);
+ break;
+ case Ity_I128:
+ hreg = mkHReg(j++, HRcInt64, True);
+ hregHI = mkHReg(j++, HRcInt64, True);
+ break;
case Ity_F32:
case Ity_F64:
- case Ity_V128: hreg = mkHReg(j++, HRcVec128, True); break;
- default: ppIRType(bb->tyenv->types[i]);
- vpanic("iselBB(amd64): IRTemp type");
+ case Ity_V128:
+ hreg = mkHReg(j++, HRcVec128, True);
+ break;
+ case Ity_V256:
+ hreg = mkHReg(j++, HRcVec128, True);
+ hregHI = mkHReg(j++, HRcVec128, True);
+ break;
+ default:
+ ppIRType(bb->tyenv->types[i]);
+ vpanic("iselBB(amd64): IRTemp type");
}
env->vregmap[i] = hreg;
env->vregmapHI[i] = hregHI;
diff --git a/VEX/priv/host_generic_reg_alloc2.c b/VEX/priv/host_generic_reg_alloc2.c
index 5052d9df8f..2fc97c3c6f 100644
--- a/VEX/priv/host_generic_reg_alloc2.c
+++ b/VEX/priv/host_generic_reg_alloc2.c
@@ -207,10 +207,13 @@ Int findMostDistantlyMentionedVReg (
/* Check that this vreg has been assigned a sane spill offset. */
static inline void sanity_check_spill_offset ( VRegLR* vreg )
{
- if (vreg->reg_class == HRcVec128 || vreg->reg_class == HRcFlt64) {
- vassert(0 == ((UShort)vreg->spill_offset % 16));
- } else {
- vassert(0 == ((UShort)vreg->spill_offset % 8));
+ switch (vreg->reg_class) {
+ case HRcVec256:
+ vassert(0 == ((UShort)vreg->spill_offset % 32)); break;
+ case HRcVec128: case HRcFlt64:
+ vassert(0 == ((UShort)vreg->spill_offset % 16)); break;
+ default:
+ vassert(0 == ((UShort)vreg->spill_offset % 8)); break;
}
}
@@ -398,9 +401,9 @@ HInstrArray* doRegisterAllocation (
not at each insn processed. */
Bool do_sanity_check;
- vassert(0 == (guest_sizeB % 16));
- vassert(0 == (LibVEX_N_SPILL_BYTES % 16));
- vassert(0 == (N_SPILL64S % 2));
+ vassert(0 == (guest_sizeB % 32));
+ vassert(0 == (LibVEX_N_SPILL_BYTES % 32));
+ vassert(0 == (N_SPILL64S % 4));
/* The live range numbers are signed shorts, and so limiting the
number of insns to 10000 comfortably guards against them
@@ -790,18 +793,24 @@ HInstrArray* doRegisterAllocation (
/* Each spill slot is 8 bytes long. For vregs which take more
than 64 bits to spill (classes Flt64 and Vec128), we have to allocate
- two spill slots.
+ two consecutive spill slots. For 256-bit registers (class
+ Vec256), we have to allocate four consecutive spill slots.
For Vec128-class on PowerPC, the spill slot's actual address
must be 16-byte aligned. Since the spill slot's address is
computed as an offset from the guest state pointer, and since
the user of the generated code must set that pointer to a
- 16-aligned value, we have the residual obligation here of
+ 32-aligned value, we have the residual obligation here of
choosing a 16-aligned spill slot offset for Vec128-class
values. Since each spill slot is 8 bytes long, that means for
Vec128-class values we must allocated a spill slot number
which is zero mod 2.
+ Similarly, for Vec256 class on amd64, find a spill slot number
+ which is zero mod 4. This guarantees it will be 32-byte
+ aligned, which isn't actually necessary on amd64 (we use movUpd
+ etc to spill), but seems like good practice.
+
Do a rank-based allocation of vregs to spill slot numbers. We
put as few values as possible in spill slots, but nevertheless
need to have a spill slot available for all vregs, just in
case.
@@ -821,48 +830,72 @@ HInstrArray* doRegisterAllocation (
}
/* The spill slots are 64 bits in size. As per the comment on
- definition of HRegClass in host_generic_regs.h, that means, to
- spill a vreg of class Flt64 or Vec128, we'll need to find two
- adjacent spill slots to use. Note, this logic needs to kept
- in sync with the size info on the definition of HRegClass. */
-
- if (vreg_lrs[j].reg_class == HRcVec128
- || vreg_lrs[j].reg_class == HRcFlt64) {
-
- /* Find two adjacent free slots in which between them provide
- up to 128 bits in which to spill the vreg. Since we are
- trying to find an even:odd pair, move along in steps of 2
- (slots). */
-
- for (k = 0; k < N_SPILL64S-1; k += 2)
- if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
- && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
- break;
- if (k >= N_SPILL64S-1) {
- vpanic("LibVEX_N_SPILL_BYTES is too low. "
- "Increase and recompile.");
- }
- if (0) vex_printf("16-byte spill offset in spill slot %d\n", (Int)k);
- ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
- ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
-
- } else {
+ definition of HRegClass in host_generic_regs.h, that means,
+ to spill a vreg of class Flt64 or Vec128, we'll need to find
+ two adjacent spill slots to use. For Vec256, we'll need to
+ find four adjacent slots to use. Note, this logic needs to
+ be kept in sync with the size info on the definition of
+ HRegClass. */
+ switch (vreg_lrs[j].reg_class) {
+
+ case HRcVec256:
+ /* Find four adjacent free slots which between them
+ provide 256 bits in which to spill the vreg. Since we
+ are trying to find a 32-byte-aligned slot, move along
+ in steps of 4 (slots). */
+ for (k = 0; k < N_SPILL64S-3; k += 4)
+ if (ss_busy_until_before[k+0] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+2] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+3] <= vreg_lrs[j].live_after)
+ break;
+ if (k >= N_SPILL64S-3) {
+ vpanic("LibVEX_N_SPILL_BYTES is too low. "
+ "Increase and recompile.");
+ }
+ if (0) vex_printf("32-byte spill offset in spill slot %d\n",
+ (Int)k);
+ ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+2] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+3] = vreg_lrs[j].dead_before;
+ break;
- /* The ordinary case -- just find a single spill slot. */
+ case HRcVec128: case HRcFlt64:
+ /* Find two adjacent free slots which between them
+ provide up to 128 bits in which to spill the vreg.
+ Since we are trying to find an even:odd pair, move
+ along in steps of 2 (slots). */
+ for (k = 0; k < N_SPILL64S-1; k += 2)
+ if (ss_busy_until_before[k+0] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
+ break;
+ if (k >= N_SPILL64S-1) {
+ vpanic("LibVEX_N_SPILL_BYTES is too low. "
+ "Increase and recompile.");
+ }
+ if (0) vex_printf("16-byte spill offset in spill slot %d\n",
+ (Int)k);
+ ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
+ break;
- /* Find the lowest-numbered spill slot which is available at
- the start point of this interval, and assign the interval
- to it.
*/ - for (k = 0; k < N_SPILL64S; k++) - if (ss_busy_until_before[k] <= vreg_lrs[j].live_after) - break; - if (k == N_SPILL64S) { - vpanic("LibVEX_N_SPILL_BYTES is too low. " - "Increase and recompile."); - } - ss_busy_until_before[k] = vreg_lrs[j].dead_before; + default: + /* The ordinary case -- just find a single spill slot. */ + /* Find the lowest-numbered spill slot which is available + at the start point of this interval, and assign the + interval to it. */ + for (k = 0; k < N_SPILL64S; k++) + if (ss_busy_until_before[k] <= vreg_lrs[j].live_after) + break; + if (k == N_SPILL64S) { + vpanic("LibVEX_N_SPILL_BYTES is too low. " + "Increase and recompile."); + } + ss_busy_until_before[k] = vreg_lrs[j].dead_before; + break; - } + } /* switch (vreg_lrs[j].reg_class) { */ /* This reflects LibVEX's hard-wired knowledge of the baseBlock layout: the guest state, then two equal sized areas following diff --git a/VEX/priv/host_generic_regs.c b/VEX/priv/host_generic_regs.c index 713add9cb7..999c7f2abf 100644 --- a/VEX/priv/host_generic_regs.c +++ b/VEX/priv/host_generic_regs.c @@ -49,6 +49,7 @@ void ppHRegClass ( HRegClass hrc ) case HRcFlt64: vex_printf("HRcFlt64"); break; case HRcVec64: vex_printf("HRcVec64"); break; case HRcVec128: vex_printf("HRcVec128"); break; + case HRcVec256: vex_printf("HRcVec256"); break; default: vpanic("ppHRegClass"); } } @@ -65,6 +66,7 @@ void ppHReg ( HReg r ) case HRcFlt64: vex_printf("%%%sD%d", maybe_v, regNo); return; case HRcVec64: vex_printf("%%%sv%d", maybe_v, regNo); return; case HRcVec128: vex_printf("%%%sV%d", maybe_v, regNo); return; + case HRcVec256: vex_printf("%%%sDV%d", maybe_v, regNo); return; default: vpanic("ppHReg"); } } diff --git a/VEX/priv/host_generic_regs.h b/VEX/priv/host_generic_regs.h index e5c25b5263..f23d7f6375 100644 --- a/VEX/priv/host_generic_regs.h +++ b/VEX/priv/host_generic_regs.h @@ -87,6 +87,7 @@ typedef UInt HReg; so won't fit in a 64-bit slot) HRcVec64 64 bits HRcVec128 128 bits + HRcVec256 256 bits If you add another regclass, you must remember to update host_generic_reg_alloc2.c accordingly. 
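The new HRcVec256 spill case added to host_generic_reg_alloc2.c above is exactly the kind of update that reminder refers to. The sketch below is illustrative only and not VEX code: the slot count and the busy_until array are stand-ins for N_SPILL64S and ss_busy_until_before[]. It walks the 64-bit spill slots in steps of four, so any slot number it returns is zero mod 4, and the corresponding byte offset (slot * 8) is therefore a multiple of 32.

   #include <stdio.h>

   #define N_SPILL_SLOTS 32   /* illustrative; the real bound is N_SPILL64S */

   /* First-fit search for four consecutive free 64-bit slots, stepping by 4
      so the chosen slot number is 0 mod 4, i.e. its byte offset (slot * 8)
      is 32-byte aligned.  Slot k is free for a live range starting at
      'live_after' when busy_until[k] <= live_after, mirroring the
      ss_busy_until_before[] test above.  Returns -1 where the real code
      would vpanic. */
   static int find_vec256_slot ( const int* busy_until, int live_after )
   {
      int k;
      for (k = 0; k < N_SPILL_SLOTS-3; k += 4) {
         if (busy_until[k+0] <= live_after && busy_until[k+1] <= live_after
             && busy_until[k+2] <= live_after && busy_until[k+3] <= live_after)
            return k;
      }
      return -1;
   }

   int main ( void )
   {
      int busy[N_SPILL_SLOTS] = {0};
      busy[0] = busy[1] = busy[2] = busy[3] = 10;  /* group 0..3 busy until insn 10 */
      busy[4] = 7;                                 /* spoils group 4..7 too */
      int k = find_vec256_slot(busy, 5);           /* live range starts at insn 5 */
      printf("slot %d, byte offset %d\n", k, k * 8);  /* prints: slot 8, byte offset 64 */
      return 0;
   }

Groups 0..3 and 4..7 are rejected because at least one of their slots is still busy when the live range starts, so the first acceptable group begins at slot 8, giving a 64-byte and hence 32-byte-aligned offset.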
@@ -99,7 +100,8 @@ typedef HRcFlt32=5, /* 32-bit float */ HRcFlt64=6, /* 64-bit float */ HRcVec64=7, /* 64-bit SIMD */ - HRcVec128=8 /* 128-bit SIMD */ + HRcVec128=8, /* 128-bit SIMD */ + HRcVec256=9 } HRegClass; @@ -122,7 +124,7 @@ static inline HReg mkHReg ( UInt regno, HRegClass rc, Bool virtual ) { static inline HRegClass hregClass ( HReg r ) { UInt rc = r; rc = (rc >> 28) & 0x0F; - vassert(rc >= HRcInt32 && rc <= HRcVec128); + vassert(rc >= HRcInt32 && rc <= HRcVec256); return (HRegClass)rc; } diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 0382a50104..4d671f6b0d 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -57,10 +57,11 @@ void ppIRType ( IRType ty ) case Ity_F32: vex_printf( "F32"); break; case Ity_F64: vex_printf( "F64"); break; case Ity_F128: vex_printf( "F128"); break; - case Ity_V128: vex_printf( "V128"); break; case Ity_D32: vex_printf( "D32"); break; case Ity_D64: vex_printf( "D64"); break; case Ity_D128: vex_printf( "D128"); break; + case Ity_V128: vex_printf( "V128"); break; + case Ity_V256: vex_printf( "V256"); break; default: vex_printf("ty = 0x%x\n", (Int)ty); vpanic("ppIRType"); } @@ -976,6 +977,11 @@ void ppIROp ( IROp op ) return; case Iop_ReinterpI64asD64: vex_printf("ReinterpI64asD64"); return; case Iop_ReinterpD64asI64: vex_printf("ReinterpD64asI64"); return; + case Iop_V256to64_0: vex_printf("V256to64_0"); return; + case Iop_V256to64_1: vex_printf("V256to64_1"); return; + case Iop_V256to64_2: vex_printf("V256to64_2"); return; + case Iop_V256to64_3: vex_printf("V256to64_3"); return; + case Iop_64x4toV256: vex_printf("64x4toV256"); return; default: vpanic("ppIROp(1)"); } @@ -2724,6 +2730,13 @@ void typeOfPrimop ( IROp op, case Iop_DivD128: TERNARY(ity_RMode,Ity_D128,Ity_D128, Ity_D128); + case Iop_V256to64_0: case Iop_V256to64_1: + case Iop_V256to64_2: case Iop_V256to64_3: + UNARY(Ity_V256, Ity_I64); + + case Iop_64x4toV256: + QUATERNARY(Ity_I64, Ity_I64, Ity_I64, Ity_I64, Ity_V256); + default: ppIROp(op); vpanic("typeOfPrimop"); @@ -2869,7 +2882,7 @@ Bool isPlausibleIRType ( IRType ty ) case Ity_I64: case Ity_I128: case Ity_F32: case Ity_F64: case Ity_F128: case Ity_D32: case Ity_D64: case Ity_D128: - case Ity_V128: + case Ity_V128: case Ity_V256: return True; default: return False; @@ -3742,10 +3755,11 @@ Int sizeofIRType ( IRType ty ) case Ity_F32: return 4; case Ity_F64: return 8; case Ity_F128: return 16; - case Ity_V128: return 16; case Ity_D32: return 4; case Ity_D64: return 8; case Ity_D128: return 16; + case Ity_V128: return 16; + case Ity_V256: return 32; default: vex_printf("\n"); ppIRType(ty); vex_printf("\n"); vpanic("sizeofIRType"); } diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c index 240d89e7de..bec7e0610b 100644 --- a/VEX/priv/ir_opt.c +++ b/VEX/priv/ir_opt.c @@ -5045,7 +5045,8 @@ static void considerExpensives ( /*OUT*/Bool* hasGetIorPutI, case Ity_I1: case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: case Ity_I128: break; - case Ity_F32: case Ity_F64: case Ity_F128: case Ity_V128: + case Ity_F32: case Ity_F64: case Ity_F128: + case Ity_V128: case Ity_V256: *hasVorFtemps = True; break; case Ity_D32: case Ity_D64: case Ity_D128: diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index 83a93e59ca..f3516960e4 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -1055,6 +1055,7 @@ static HChar* show_hwcaps_amd64 ( UInt hwcaps ) very stupid. We should add strings independently based on feature bits, but then it would be hard to return a string that didn't need deallocating by the caller.) 
*/
+ /* FIXME: show_hwcaps_s390x is a much better way to do this. */
switch (hwcaps) {
case 0: return "amd64-sse2";
diff --git a/VEX/pub/libvex_basictypes.h b/VEX/pub/libvex_basictypes.h
index 4731d71246..b04e6e471b 100644
--- a/VEX/pub/libvex_basictypes.h
+++ b/VEX/pub/libvex_basictypes.h
@@ -62,6 +62,9 @@ typedef signed long long int Long;
/* Always 128 bits. */
typedef UInt U128[4];
+/* Always 256 bits. */
+typedef UInt U256[8];
+
/* A union for doing 128-bit vector primitives conveniently. */
typedef union {
diff --git a/VEX/pub/libvex_guest_amd64.h b/VEX/pub/libvex_guest_amd64.h
index 7d5d354ed9..9e634d8d60 100644
--- a/VEX/pub/libvex_guest_amd64.h
+++ b/VEX/pub/libvex_guest_amd64.h
@@ -96,28 +96,28 @@ typedef
associated with a %fs value of zero. */
/* 200 */ ULong guest_FS_ZERO;
- /* XMM registers. Note that these must be allocated
+ /* YMM registers. Note that these must be allocated
consecutively in order that the SSE4.2 PCMP{E,I}STR{I,M}
- helpers can treat them as an array. XMM16 is a fake reg used
+ helpers can treat them as an array. YMM16 is a fake reg used
as an intermediary in handling aforementioned insns. */
/* 208 */ULong guest_SSEROUND;
- /* 216 */U128 guest_XMM0;
- U128 guest_XMM1;
- U128 guest_XMM2;
- U128 guest_XMM3;
- U128 guest_XMM4;
- U128 guest_XMM5;
- U128 guest_XMM6;
- U128 guest_XMM7;
- U128 guest_XMM8;
- U128 guest_XMM9;
- U128 guest_XMM10;
- U128 guest_XMM11;
- U128 guest_XMM12;
- U128 guest_XMM13;
- U128 guest_XMM14;
- U128 guest_XMM15;
- U128 guest_XMM16;
+ /* 216 */U256 guest_YMM0;
+ U256 guest_YMM1;
+ U256 guest_YMM2;
+ U256 guest_YMM3;
+ U256 guest_YMM4;
+ U256 guest_YMM5;
+ U256 guest_YMM6;
+ U256 guest_YMM7;
+ U256 guest_YMM8;
+ U256 guest_YMM9;
+ U256 guest_YMM10;
+ U256 guest_YMM11;
+ U256 guest_YMM12;
+ U256 guest_YMM13;
+ U256 guest_YMM14;
+ U256 guest_YMM15;
+ U256 guest_YMM16;
/* FPU */
/* Note. Setting guest_FTOP to be ULong messes up the
diff --git a/VEX/pub/libvex_guest_x86.h b/VEX/pub/libvex_guest_x86.h
index e0b1b7631f..ead8b68a61 100644
--- a/VEX/pub/libvex_guest_x86.h
+++ b/VEX/pub/libvex_guest_x86.h
@@ -221,8 +221,8 @@ typedef
been interrupted by a signal. */
UInt guest_IP_AT_SYSCALL;
- /* Padding to make it have an 16-aligned size */
- UInt padding1;
+ /* Padding to make it have a 32-aligned size */
+ UInt padding[5];
} VexGuestX86State;
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 73db9e0cde..502dcf6b45 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -231,7 +231,8 @@ typedef
Ity_D64, /* 64-bit Decimal floating point */
Ity_D128, /* 128-bit Decimal floating point */
Ity_F128, /* 128-bit floating point; implementation defined */
- Ity_V128 /* 128-bit SIMD */
+ Ity_V128, /* 128-bit SIMD */
+ Ity_V256 /* 256-bit SIMD */
} IRType;
@@ -1407,7 +1408,18 @@ typedef
/* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
See floating-point equiwalents for details. */
- Iop_Recip32x4, Iop_Rsqrte32x4
+ Iop_Recip32x4, Iop_Rsqrte32x4,
+
+ /* ------------------ 256-bit SIMD Integer. ------------------ */
+
+ /* Pack/unpack */
+ Iop_V256to64_0, // V256 -> I64, extract least significant lane
+ Iop_V256to64_1,
+ Iop_V256to64_2,
+ Iop_V256to64_3, // V256 -> I64, extract most significant lane
+
+ Iop_64x4toV256 // (I64,I64,I64,I64)->V256
+ // first arg is most significant lane
} IROp;
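A note on the lane numbering introduced here: U256 is eight 32-bit words, Iop_V256to64_0 extracts the least significant 64-bit lane and Iop_V256to64_3 the most significant, and the first argument of Iop_64x4toV256 is the most significant lane. That is why iselDVecExpr above puts arg1 and arg2 (lanes 3 and 2) into vHi, the half it loads and stores at guest offset +16. The sketch below is illustrative only and not part of the patch; pack_64x4toV256 and extract_V256to64 are made-up names, and a little-endian host is assumed, as it is for the amd64 backend.

   #include <stdio.h>
   #include <string.h>

   typedef unsigned int       UInt;
   typedef unsigned long long ULong;
   typedef UInt U256[8];   /* as in libvex_basictypes.h: always 256 bits */

   /* Illustrative model of Iop_64x4toV256: the first argument is the most
      significant lane (lane 3), the last is lane 0.  On a little-endian
      host, lane k occupies bytes 8*k .. 8*k+7. */
   static void pack_64x4toV256 ( U256 dst, ULong q3, ULong q2, ULong q1, ULong q0 )
   {
      ULong lanes[4] = { q0, q1, q2, q3 };   /* lane 0 = least significant */
      memcpy(dst, lanes, sizeof lanes);
   }

   /* Illustrative model of Iop_V256to64_<n>: extract 64-bit lane n. */
   static ULong extract_V256to64 ( const U256 src, int n )
   {
      ULong lane;
      memcpy(&lane, (const unsigned char*)src + 8*n, sizeof lane);
      return lane;
   }

   int main ( void )
   {
      U256 v;
      pack_64x4toV256(v, 0x3333333333333333ULL, 0x2222222222222222ULL,
                         0x1111111111111111ULL, 0x0000000000000000ULL);
      printf("lane0 = %016llx\n", extract_V256to64(v, 0));  /* least significant: 0000... */
      printf("lane3 = %016llx\n", extract_V256to64(v, 3));  /* most significant:  3333... */
      return 0;
   }

On a little-endian host, lane k therefore sits at byte offset 8*k of the U256, which matches vLo carrying lanes 0..1 (bytes 0..15) and vHi carrying lanes 2..3 (bytes 16..31) in the two-XMM representation used by iselDVecExpr.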