From: Julian Seward Date: Wed, 27 Mar 2013 11:37:33 +0000 (+0000) Subject: AMD64: Add support for AVX2, BMI1, BMI2 and FMA instructions (VEX side). X-Git-Tag: svn/VALGRIND_3_9_0^2~93 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=26fc722aa7a2e9ee4aee659a76cf8014f387f888;p=thirdparty%2Fvalgrind.git AMD64: Add support for AVX2, BMI1, BMI2 and FMA instructions (VEX side). Fixes #305728. (Jakub Jelinek, jakub@redhat.com) git-svn-id: svn://svn.valgrind.org/vex/trunk@2702 --- diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 487f6f89a9..3bd52d4839 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -154,6 +154,9 @@ extern ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, ULong dHi, ULong dLo, ULong imm_and_return_control_bit ); +extern ULong amd64g_calculate_pext ( ULong, ULong ); +extern ULong amd64g_calculate_pdep ( ULong, ULong ); + /* --- DIRTY HELPERS --- */ extern ULong amd64g_dirtyhelper_loadF80le ( ULong/*addr*/ ); @@ -508,6 +511,18 @@ enum { AMD64G_CC_OP_SMULL, /* 51 */ AMD64G_CC_OP_SMULQ, /* 52 */ + AMD64G_CC_OP_ANDN32, /* 53 */ + AMD64G_CC_OP_ANDN64, /* 54 DEP1 = res, DEP2 = 0, NDEP = unused */ + + AMD64G_CC_OP_BLSI32, /* 55 */ + AMD64G_CC_OP_BLSI64, /* 56 DEP1 = res, DEP2 = arg, NDEP = unused */ + + AMD64G_CC_OP_BLSMSK32,/* 57 */ + AMD64G_CC_OP_BLSMSK64,/* 58 DEP1 = res, DEP2 = arg, NDEP = unused */ + + AMD64G_CC_OP_BLSR32, /* 59 */ + AMD64G_CC_OP_BLSR64, /* 60 DEP1 = res, DEP2 = arg, NDEP = unused */ + AMD64G_CC_OP_NUMBER }; diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index 488757fe04..c3cf1e20b5 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -492,6 +492,72 @@ static inline ULong idULong ( ULong x ) } \ } +/*-------------------------------------------------------------*/ + +#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = 0; \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 != 0); \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 == 0); \ + pf = 0; \ + af = 0; \ + zf = 0; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 == 0); \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + #if PROFILE_RFLAGS @@ -655,6 +721,18 @@ ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op, case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ; + case AMD64G_CC_OP_ANDN32: 
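// Reference sketch, for exposition only (not part of the patch): how the
// new BMI1 thunk encodings above are intended to be read.  DEP1 holds the
// instruction result, DEP2 the original source; PF, AF and OF are always
// cleared.  Shown for the 64-bit BLSI case, with <stdint.h> types standing
// in for VEX's ULong.
#include <stdint.h>

typedef struct { int cf, zf, sf; } BmiFlags;

static BmiFlags blsi64_flags_reference ( uint64_t arg )
{
   uint64_t res = arg & (0ULL - arg);   // BLSI: isolate lowest set bit
   BmiFlags f;
   f.cf = (arg != 0);                   // CF <- source was non-zero
   f.zf = (res == 0);                   // ZF <- result is zero
   f.sf = (int)((res >> 63) & 1);       // SF <- msb of result
   return f;                            // PF, AF, OF: always zero
}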
ACTIONS_ANDN( 32, UInt ); + case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong ); + + case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt ); + case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong ); + + case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt ); + case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong ); + + case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt ); + case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong ); + default: /* shouldn't really make these calls from generated code */ vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)" @@ -3139,6 +3217,36 @@ ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, return res; } +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +ULong amd64g_calculate_pext ( ULong src_masked, ULong mask ) +{ + ULong dst = 0; + ULong src_bit; + ULong dst_bit = 1; + for (src_bit = 1; src_bit; src_bit <<= 1) { + if (mask & src_bit) { + if (src_masked & src_bit) dst |= dst_bit; + dst_bit <<= 1; + } + } + return dst; +} + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +ULong amd64g_calculate_pdep ( ULong src, ULong mask ) +{ + ULong dst = 0; + ULong dst_bit; + ULong src_bit = 1; + for (dst_bit = 1; dst_bit; dst_bit <<= 1) { + if (mask & dst_bit) { + if (src & src_bit) dst |= dst_bit; + src_bit <<= 1; + } + } + return dst; +} + /*---------------------------------------------------------------*/ /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 2b200fc190..481f7531c2 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -1290,6 +1290,38 @@ const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm ) } +static +IRExpr* getIRegV ( Int sz, Prefix pfx ) +{ + if (sz == 4) { + sz = 8; + return unop(Iop_64to32, + IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ), + szToITy(sz) )); + } else { + return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ), + szToITy(sz) ); + } +} + +static +void putIRegV ( Int sz, Prefix pfx, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz)); + if (sz == 4) { + e = unop(Iop_32Uto64,e); + } + stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) ); +} + +static +const HChar* nameIRegV ( Int sz, Prefix pfx ) +{ + return nameIReg( sz, getVexNvvvv(pfx), False ); +} + + + /* Produce the guest state offset for a reference to the 'e' register field in a modrm byte, taking into account REX (or its absence), and the size of the access. eregOfRexRM will assert if mod_reg_rm @@ -2677,6 +2709,88 @@ IRTemp disAMode ( /*OUT*/Int* len, } +/* Similarly for VSIB addressing. This returns just the addend, + and fills in *rI and *vscale with the register number of the vector + index and its multiplicand. 
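   (Aside, for exposition only: a worked example of the two bit-deposit and
   bit-extract helpers defined above.  PDEP scatters the low-order bits of
   its first operand into the positions where the mask has 1s; PEXT gathers
   the masked bits back down into contiguous low bits.  uint64_t stands in
   for VEX's ULong, and the program assumes the helpers above are linked in.

   #include <stdio.h>
   #include <stdint.h>

   extern uint64_t amd64g_calculate_pdep ( uint64_t src, uint64_t mask );
   extern uint64_t amd64g_calculate_pext ( uint64_t src_masked, uint64_t mask );

   int main ( void )
   {
      // 0xAB spread into the set nibbles of mask 0xF0F0 gives 0xA0B0 ...
      printf("%llx\n", (unsigned long long)
                       amd64g_calculate_pdep(0xABULL, 0xF0F0ULL));
      // ... and PEXT with the same mask recovers 0xAB.
      printf("%llx\n", (unsigned long long)
                       amd64g_calculate_pext(0xA0B0ULL, 0xF0F0ULL));
      return 0;
   }

   End of aside.)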
*/ +static +IRTemp disAVSIBMode ( /*OUT*/Int* len, + VexAbiInfo* vbi, Prefix pfx, Long delta, + /*OUT*/HChar* buf, /*OUT*/UInt* rI, + IRType ty, /*OUT*/Int* vscale ) +{ + UChar mod_reg_rm = getUChar(delta); + const HChar *vindex; + + *len = 0; + *rI = 0; + *vscale = 0; + buf[0] = (UChar)0; + if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm)) + return IRTemp_INVALID; + + UChar sib = getUChar(delta+1); + UChar scale = toUChar((sib >> 6) & 3); + UChar index_r = toUChar((sib >> 3) & 7); + UChar base_r = toUChar(sib & 7); + Long d = 0; + /* correct since #(R13) == 8 + #(RBP) */ + Bool base_is_BPor13 = toBool(base_r == R_RBP); + delta += 2; + *len = 2; + + *rI = index_r | (getRexX(pfx) << 3); + if (ty == Ity_V128) + vindex = nameXMMReg(*rI); + else + vindex = nameYMMReg(*rI); + *vscale = 1<> 6) { + case 0: + if (base_is_BPor13) { + d = getSDisp32(delta); + *len += 4; + if (scale == 0) { + DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex); + } else { + DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<IR: sbb %%r,%%r optimisation(1)\n"); - putIRegG(size,pfx,rm, mkU(ty,0)); + putIRegG(size,pfx,rm, mkU(ty,0)); } assign( dst0, getIRegG(size,pfx,rm) ); @@ -3734,7 +3848,7 @@ ULong dis_Grp8_Imm ( VexAbiInfo* vbi, /* Write the result back, if non-BT. */ if (gregLO3ofRM(modrm) != 4 /* BT */) { if (epartIsReg(modrm)) { - putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m))); + putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m))); } else { if (pfx & PFX_LOCK) { casLE( mkexpr(t_addr), @@ -3931,7 +4045,7 @@ ULong dis_Grp3 ( VexAbiInfo* vbi, } else { addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /* we have to inform disAMode of any immediate - bytes used */ + bytes used */ gregLO3ofRM(modrm)==0/*TEST*/ ? imin(4,sz) : 0 @@ -4212,9 +4326,9 @@ ULong dis_Grp5 ( VexAbiInfo* vbi, putIReg64(R_RSP, mkexpr(t2) ); storeLE( mkexpr(t2), mkexpr(t3) ); break; - } else { + } else { goto unhandled; /* awaiting test case */ - } + } default: unhandled: *decode_OK = False; @@ -4673,6 +4787,34 @@ static IRTemp gen_LZCNT ( IRType ty, IRTemp src ) } +/* Generate an IR sequence to do a count-trailing-zeroes operation on + the supplied IRTemp, and return a new IRTemp holding the result. + 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where + the argument is zero, return the number of bits in the word (the + natural semantics). */ +static IRTemp gen_TZCNT ( IRType ty, IRTemp src ) +{ + vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16); + + IRTemp src64 = newTemp(Ity_I64); + assign(src64, widenUto64( mkexpr(src) )); + + // Ctz64 has undefined semantics when its input is zero, so + // special-case around that. + IRTemp res64 = newTemp(Ity_I64); + assign(res64, + IRExpr_ITE( + binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)), + mkU64(8 * sizeofIRType(ty)), + unop(Iop_Ctz64, mkexpr(src64)) + )); + + IRTemp res = newTemp(ty); + assign(res, narrowTo(ty, mkexpr(res64))); + return res; +} + + /*------------------------------------------------------------*/ /*--- ---*/ /*--- x87 FLOATING POINT INSTRUCTIONS ---*/ @@ -5248,7 +5390,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok, issue. If needed, side-exit to the next insn, reporting the warning, so that Valgrind's dispatcher sees the warning. 
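   (Aside, for exposition only: the VSIB byte consumed by disAVSIBMode above
   splits into scale/index/base exactly like an ordinary SIB byte, except
   that the index field, extended by VEX.X, names a vector register rather
   than a GPR.  A minimal decode, with uint8_t standing in for VEX's UChar:

   #include <stdint.h>

   typedef struct { int scale; int index; int base; } VSIBFields;

   static VSIBFields decode_vsib ( uint8_t sib, int vex_x )
   {
      VSIBFields f;
      f.scale = 1 << ((sib >> 6) & 3);                 // multiplicand: 1, 2, 4 or 8
      f.index = (int)((sib >> 3) & 7) | (vex_x << 3);  // xmm/ymm index register
      f.base  = (int)(sib & 7);                        // GPR base, subject to REX.B
      return f;
   }

   disAVSIBMode performs the same split and additionally folds the base
   register and displacement into the returned addend.  End of aside.)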
*/ - assign(ew, unop(Iop_64to32,mkexpr(w64)) ); + assign(ew, unop(Iop_64to32,mkexpr(w64)) ); put_emwarn( mkexpr(ew) ); stmt( IRStmt_Exit( @@ -7512,7 +7654,7 @@ ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi, binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)), binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)) )); - /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */ + /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */ assign( res64, binop(Iop_Shr64, binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)), @@ -8142,8 +8284,7 @@ ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok, putIRegG(sz, pfx, rm, mkexpr(tmpd)); putIRegE(sz, pfx, rm, mkexpr(tmpt1)); DIP("xadd%c %s, %s\n", - nameISize(sz), nameIRegG(sz,pfx,rm), - nameIRegE(sz,pfx,rm)); + nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm)); *decode_ok = True; return 1+delta0; } @@ -8570,7 +8711,7 @@ static ULong dis_SSEint_E_to_G( } putXMMReg( gregOfRexRM(pfx,rm), eLeft ? binop(op, epart, gpart) - : binop(op, gpart, epart) ); + : binop(op, gpart, epart) ); return delta; } @@ -8743,7 +8884,7 @@ static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi, ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr))) : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))) - ) + ) ); delta += alen+1; DIP("%s $%d,%s,%s\n", opname, @@ -9267,6 +9408,31 @@ static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) { return math_PABS_XMM(aa, 1); } +/* YMM version of math_PABS_XMM. */ +static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB ) +{ + IRTemp res = newTemp(Ity_V256); + IRTemp aaHi = IRTemp_INVALID; + IRTemp aaLo = IRTemp_INVALID; + breakupV256toV128s(aa, &aaHi, &aaLo); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PABS_XMM(aaHi, laneszB)), + mkexpr(math_PABS_XMM(aaLo, laneszB)))); + return res; +} + +static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) { + return math_PABS_YMM(aa, 4); +} + +static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) { + return math_PABS_YMM(aa, 2); +} + +static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) { + return math_PABS_YMM(aa, 1); +} + static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, IRTemp lo64, Long byteShift ) { @@ -9634,6 +9800,47 @@ static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PSHUFD_32x8 ( VexAbiInfo* vbi, Prefix pfx, Long delta ) +{ + Int order; + Int alen = 0; + HChar dis_buf[50]; + IRTemp sV = newTemp(Ity_V256); + UChar modrm = getUChar(delta); + IRTemp addr = IRTemp_INVALID; + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + order = (Int)getUChar(delta+1); + delta += 1+1; + DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + order = (Int)getUChar(delta+alen); + delta += alen+1; + DIP("vpshufd $%d,%s,%s\n", order, dis_buf, nameYMMReg(rG)); + } + + IRTemp s[8]; + s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4], + &s[3], &s[2], &s[1], &s[0] ); + + putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)], + s[4 + ((order>>4)&3)], + s[4 + ((order>>2)&3)], + s[4 + ((order>>0)&3)], + s[0 + ((order>>6)&3)], + s[0 + ((order>>4)&3)], + s[0 + ((order>>2)&3)], + s[0 + ((order>>0)&3)] ) ); + return delta; +} + + static IRTemp math_PSRLDQ ( IRTemp sV, Int imm ) { IRTemp dV = newTemp(Ity_V128); @@ -10280,6 +10487,28 @@ static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVMSKB_256 ( 
VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + UChar modrm = getUChar(delta); + vassert(epartIsReg(modrm)); /* ensured by caller */ + UInt rE = eregOfRexRM(pfx,modrm); + UInt rG = gregOfRexRM(pfx,modrm); + IRTemp t0 = newTemp(Ity_V128); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_I16); + IRTemp t3 = newTemp(Ity_I16); + assign(t0, getYMMRegLane128(rE, 0)); + assign(t1, getYMMRegLane128(rE, 1)); + assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0))); + assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1))); + putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2))); + DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG)); + delta += 1; + return delta; +} + + /* FIXME: why not just use InterleaveLO / InterleaveHI? I think the relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */ /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */ @@ -10542,6 +10771,22 @@ static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV ) } +static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV ) +{ + /* This is a really poor translation -- could be improved if + performance critical */ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMULUDQ_128(sHi, dHi)), + mkexpr(math_PMULUDQ_128(sLo, dLo)))); + return res; +} + + static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV ) { /* This is a really poor translation -- could be improved if @@ -10558,6 +10803,22 @@ static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV ) +{ + /* This is a really poor translation -- could be improved if + performance critical */ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMULDQ_128(sHi, dHi)), + mkexpr(math_PMULDQ_128(sLo, dLo)))); + return res; +} + + static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV ) { IRTemp sVhi, sVlo, dVhi, dVlo; @@ -10580,6 +10841,20 @@ static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMADDWD_128(dHi, sHi)), + mkexpr(math_PMADDWD_128(dLo, sLo)))); + return res; +} + + static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV ) { IRTemp addV = newTemp(Ity_V128); @@ -10713,6 +10988,54 @@ static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handle 256 bit PSHUFLW and PSHUFHW. */ +static Long dis_PSHUFxW_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsH ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt imm8; + IRTemp sV, s[8], sV64[4], dVhi, dVlo; + sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID; + s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + sV = newTemp(Ity_V256); + dVhi = newTemp(Ity_I64); + dVlo = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + imm8 = (UInt)getUChar(delta+1); + delta += 1+1; + DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 
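// Reference semantics, for exposition only, of the VPMOVMSKB translation
// above: bit i of the 32-bit result is the most significant bit of byte i
// of the YMM source, with the high 128-bit lane supplying bits 16..31
// (hence the two GetMSBs8x16 ops glued together with 16HLto32).
#include <stdint.h>

static uint32_t vpmovmskb256_reference ( const uint8_t src[32] )
{
   uint32_t mask = 0;
   for (int i = 0; i < 32; i++)
      mask |= (uint32_t)(src[i] >> 7) << i;
   return mask;
}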
'h' : 'l', + imm8, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + imm8 = (UInt)getUChar(delta+alen); + delta += alen+1; + DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l', + imm8, dis_buf, nameYMMReg(rG)); + } + + breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] ); + breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] ); + breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] ); + + assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)], + s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) ); + assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)], + s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) ); + putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3], + xIsH ? sV64[2] : dVhi, + xIsH ? dVlo : sV64[1], + xIsH ? sV64[0] : dVlo ) ); + return delta; +} + + static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -10923,6 +11246,20 @@ static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PSADBW_128(dHi, sHi)), + mkexpr(math_PSADBW_128(dLo, sLo)))); + return res; +} + + static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -11259,9 +11596,9 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, modrm = getUChar(delta); if (epartIsReg(modrm)) { putXMMReg( eregOfRexRM(pfx,modrm), - getXMMReg( gregOfRexRM(pfx,modrm) ) ); + getXMMReg( gregOfRexRM(pfx,modrm) ) ); DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), - nameXMMReg(eregOfRexRM(pfx,modrm))); + nameXMMReg(eregOfRexRM(pfx,modrm))); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); @@ -11607,9 +11944,9 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, modrm = getUChar(delta); if (epartIsReg(modrm)) { putXMMReg( eregOfRexRM(pfx,modrm), - getXMMReg( gregOfRexRM(pfx,modrm) ) ); + getXMMReg( gregOfRexRM(pfx,modrm) ) ); DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), - nameXMMReg(eregOfRexRM(pfx,modrm))); + nameXMMReg(eregOfRexRM(pfx,modrm))); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); @@ -12618,7 +12955,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, # define SEL(n) \ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? 
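// Reference semantics, for exposition only, of the 256-bit PSHUFLW/PSHUFHW
// translation above (dis_PSHUFxW_256): each 128-bit lane is handled
// independently; imm8's four 2-bit selectors rebuild one 64-bit half of the
// lane and the other half is copied through unchanged.  Shown for the
// low-half ("PSHUFLW") case on a single lane.
#include <stdint.h>

static void vpshuflw_lane_reference ( uint16_t lane[8], uint8_t imm8 )
{
   uint16_t w[4] = { lane[0], lane[1], lane[2], lane[3] };
   for (int i = 0; i < 4; i++)
      lane[i] = w[(imm8 >> (2*i)) & 3];   // shuffle the low four words
   // lane[4..7] are left untouched; PSHUFHW is the mirror image
}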
s2 : s3))) assign(dV, - mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), + mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), SEL((order>>2)&3), SEL((order>>0)&3) ) ); putMMXReg(gregLO3ofRM(modrm), mkexpr(dV)); @@ -12799,12 +13136,12 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) ); DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), nameIReg32(eregOfRexRM(pfx,modrm))); - } else { + } else { putIReg64( eregOfRexRM(pfx,modrm), getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) ); DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), nameIReg64(eregOfRexRM(pfx,modrm))); - } + } } else { addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); delta += alen; @@ -14451,6 +14788,21 @@ IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ ) } +static +IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PSHUFB_XMM(dHi, sHi)), + mkexpr(math_PSHUFB_XMM(dLo, sLo)))); + return res; +} + + static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, UChar opc ) { @@ -14490,14 +14842,16 @@ static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); assign( sV, getXMMReg(rE) ); - DIP("ph%s %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG)); + DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str, + nameXMMReg(rE), nameXMMReg(rG)); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); if (!isAvx) gen_SEGV_if_not_16_aligned( addr ); assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); - DIP("ph%s %s,%s\n", str, dis_buf, nameXMMReg(rG)); + DIP("%sph%s %s,%s\n", isAvx ? 
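// Reference semantics, for exposition only, of the per-lane behaviour that
// math_PSHUFB_YMM above relies on: the 256-bit form permutes each 128-bit
// half independently, with the same rule as the existing 128-bit PSHUFB --
// a set top bit in the permutation byte zeroes the destination byte.
#include <stdint.h>

static void pshufb_lane_reference ( uint8_t dst[16],
                                    const uint8_t src[16],
                                    const uint8_t perm[16] )
{
   for (int i = 0; i < 16; i++)
      dst[i] = (perm[i] & 0x80) ? 0 : src[perm[i] & 0x0F];
}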
"v" : "", str, + dis_buf, nameXMMReg(rG)); delta += alen; } @@ -14523,6 +14877,78 @@ static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, } +static Long dis_PHADD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + const HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + switch (opc) { + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + default: vassert(0); + } + if (opc == 0x02 || opc == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG)); + delta += alen; + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + /* This isn't a particularly efficient way to compute the + result, but at least it avoids a proliferation of IROps, + hence avoids complication all the backends. */ + + putYMMReg( rG, + binop(Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(s3),mkexpr(s2)), + binop(opCatO,mkexpr(s3),mkexpr(s2)) ), + binop(opV64, + binop(opCatE,mkexpr(d3),mkexpr(d2)), + binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ), + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(s1),mkexpr(s0)), + binop(opCatO,mkexpr(s1),mkexpr(s0)) ), + binop(opV64, + binop(opCatE,mkexpr(d1),mkexpr(d0)), + binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) ); + return delta; +} + + static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV ) { IRTemp sVoddsSX = newTemp(Ity_V128); @@ -14549,6 +14975,21 @@ static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV ) } +static +IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMADDUBSW_128(dHi, sHi)), + mkexpr(math_PMADDUBSW_128(dLo, sLo)))); + return res; +} + + __attribute__((noinline)) static Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK, @@ -15257,32 +15698,31 @@ Long dis_ESC_0F__SSE4 ( Bool* decode_OK, } break; - case 0xBD: - /* F3 0F BD -- LZCNT (count leading zeroes. An AMD extension, - which we can only decode if we're sure this is an AMD cpu - that supports LZCNT, since otherwise it's BSR, which behaves - differently. Bizarrely, my Sandy Bridge also accepts these - instructions but produces different results. */ + case 0xBC: + /* F3 0F BC -- TZCNT (count trailing zeroes. 
A BMI extension, + which we can only decode if we're sure this is a BMI1 capable cpu + that supports TZCNT, since otherwise it's BSF, which behaves + differently on zero source. */ if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */ - && (sz == 2 || sz == 4 || sz == 8) - && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) { + && (sz == 2 || sz == 4 || sz == 8) + && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) { /*IRType*/ ty = szToITy(sz); IRTemp src = newTemp(ty); modrm = getUChar(delta); if (epartIsReg(modrm)) { assign(src, getIRegE(sz, pfx, modrm)); delta += 1; - DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), + DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), nameIRegG(sz, pfx, modrm)); } else { addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0); assign(src, loadLE(ty, mkexpr(addr))); delta += alen; - DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf, nameIRegG(sz, pfx, modrm)); } - IRTemp res = gen_LZCNT(ty, src); + IRTemp res = gen_TZCNT(ty, src); putIRegG(sz, pfx, modrm, mkexpr(res)); // Update flags. This is pretty lame .. perhaps can do better @@ -15318,18 +15758,79 @@ Long dis_ESC_0F__SSE4 ( Bool* decode_OK, } break; - default: - break; - - } - - //decode_failure: - *decode_OK = False; - return deltaIN; - - decode_success: - *decode_OK = True; - return delta; + case 0xBD: + /* F3 0F BD -- LZCNT (count leading zeroes. An AMD extension, + which we can only decode if we're sure this is an AMD cpu + that supports LZCNT, since otherwise it's BSR, which behaves + differently. Bizarrely, my Sandy Bridge also accepts these + instructions but produces different results. */ + if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */ + && (sz == 2 || sz == 4 || sz == 8) + && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) { + /*IRType*/ ty = szToITy(sz); + IRTemp src = newTemp(ty); + modrm = getUChar(delta); + if (epartIsReg(modrm)) { + assign(src, getIRegE(sz, pfx, modrm)); + delta += 1; + DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), + nameIRegG(sz, pfx, modrm)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0); + assign(src, loadLE(ty, mkexpr(addr))); + delta += alen; + DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + nameIRegG(sz, pfx, modrm)); + } + + IRTemp res = gen_LZCNT(ty, src); + putIRegG(sz, pfx, modrm, mkexpr(res)); + + // Update flags. This is pretty lame .. perhaps can do better + // if this turns out to be performance critical. + // O S A P are cleared. Z is set if RESULT == 0. + // C is set if SRC is zero. 
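// Reference sketch, for exposition only, of the rflags value the IR below
// assembles for TZCNT (and, further down, LZCNT): only ZF and CF can be
// set, from the result and the source respectively; O, S, A and P are
// cleared.  The shift amounts correspond to AMD64G_CC_SHIFT_Z and
// AMD64G_CC_SHIFT_C and are passed in rather than restated here.
#include <stdint.h>

static uint64_t tzcnt_flags_reference ( uint64_t src, uint64_t res,
                                        int shift_z, int shift_c )
{
   uint64_t oszacp = 0;
   if (res == 0) oszacp |= 1ULL << shift_z;   // ZF: result is zero
   if (src == 0) oszacp |= 1ULL << shift_c;   // CF: source was zero
   return oszacp;                             // O, S, A, P: zero
}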
+ IRTemp src64 = newTemp(Ity_I64); + IRTemp res64 = newTemp(Ity_I64); + assign(src64, widenUto64(mkexpr(src))); + assign(res64, widenUto64(mkexpr(res))); + + IRTemp oszacp = newTemp(Ity_I64); + assign( + oszacp, + binop(Iop_Or64, + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_Z)), + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) )); + + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + *decode_OK = False; + return deltaIN; + + decode_success: + *decode_OK = True; + return delta; } @@ -15722,6 +16223,47 @@ static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */ +static Long dis_PMOVxXBW_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + /* First do zero extend. */ + IRExpr* res + = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI8x16, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO8x16, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + /* And if needed sign extension as well. */ + if (!xIsZ) + res = binop( Iop_SarN16x16, + binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) ); + + putYMMReg ( rG, res ); + + return delta; +} + + static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) { @@ -15761,6 +16303,45 @@ static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVxXWD_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 
'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRExpr* res + = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI16x8, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO16x8, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + if (!xIsZ) + res = binop(Iop_SarN32x8, + binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16)); + + putYMMReg ( rG, res ); + + return delta; +} + + static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -15794,6 +16375,41 @@ static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVSXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcBytes = newTemp(Ity_I64); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + + if ( epartIsReg( modrm ) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcBytes, getXMMRegLane64( rE, 0 ) ); + delta += 1; + DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 ); + putYMMReg( rG, binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_16Sto64, mkexpr(s3) ), + unop( Iop_16Sto64, mkexpr(s2) ) ), + binop( Iop_64HLtoV128, + unop( Iop_16Sto64, mkexpr(s1) ), + unop( Iop_16Sto64, mkexpr(s0) ) ) ) ); + return delta; +} + + static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -15830,6 +16446,45 @@ static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVZXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + + if ( epartIsReg( modrm ) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp( Ity_V128 ); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + putYMMReg( rG, binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI16x8, + mkexpr(zeroVec), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), mkexpr(srcVec) ) ), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ) ); + return delta; +} + + /* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */ static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) @@ -15877,6 +16532,59 @@ static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. 
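   (Aside, for exposition only: the widening pattern shared by these 256-bit
   PMOV{Z,S}X translations.  Zero-extension comes from interleaving the
   source with a zero vector; the signed forms then recover the sign with a
   shift-left / arithmetic-shift-right pair.  Shown for word-to-dword, with
   <stdint.h> types standing in for the IR vectors:

   #include <stdint.h>

   static void pmovxxwd_reference ( int32_t dst[8], const uint16_t src[8],
                                    int sign_extend )
   {
      for (int i = 0; i < 8; i++) {
         // zero-extend: in IR, interleave with a zero V128
         uint32_t zx = (uint32_t)src[i];
         // signed forms: the IR's shl 16 / sar 16 pair has the same effect
         // as the cast chain below
         dst[i] = sign_extend ? (int32_t)(int16_t)src[i] : (int32_t)zx;
      }
   }

   End of aside.)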
*/ +static Long dis_PMOVxXDQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + /* Compute both srcI64 -- the value to expand -- and srcVec -- same + thing in a V128, with arbitrary junk in the top 64 bits. Use + one or both of them and let iropt clean up afterwards (as + usual). */ + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) ); + delta += alen; + DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRExpr* res; + if (xIsZ) + res = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI32x4, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO32x4, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + else { + IRTemp s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 ); + res = binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_32Sto64, mkexpr(s3) ), + unop( Iop_32Sto64, mkexpr(s2) ) ), + binop( Iop_64HLtoV128, + unop( Iop_32Sto64, mkexpr(s1) ), + unop( Iop_32Sto64, mkexpr(s0) ) ) ); + } + + putYMMReg ( rG, res ); + + return delta; +} + + /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */ static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) @@ -15920,6 +16628,53 @@ static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */ +static Long dis_PMOVxXBD_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp(Ity_V128); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + IRExpr* res + = binop( Iop_V128HLtoV256, + binop(Iop_InterleaveHI8x16, + mkexpr(zeroVec), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec)) ), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec)) ) ); + if (!xIsZ) + res = binop(Iop_SarN32x8, + binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24)); + + putYMMReg ( rG, res ); + + return delta; +} + + /* Handles 128 bit versions of PMOVSXBQ. */ static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) @@ -15953,6 +16708,52 @@ static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVSXBQ. 
*/ +static Long dis_PMOVSXBQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcBytes = newTemp(Ity_I32); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcBytes, getXMMRegLane32( rE, 0 ) ); + delta += 1; + DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + putYMMReg + ( rG, binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_8Sto64, + unop( Iop_16HIto8, + unop( Iop_32HIto16, + mkexpr(srcBytes) ) ) ), + unop( Iop_8Sto64, + unop( Iop_16to8, + unop( Iop_32HIto16, + mkexpr(srcBytes) ) ) ) ), + binop( Iop_64HLtoV128, + unop( Iop_8Sto64, + unop( Iop_16HIto8, + unop( Iop_32to16, + mkexpr(srcBytes) ) ) ), + unop( Iop_8Sto64, + unop( Iop_16to8, + unop( Iop_32to16, + mkexpr(srcBytes) ) ) ) ) ) ); + return delta; +} + + /* Handles 128 bit versions of PMOVZXBQ. */ static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) @@ -15992,16 +16793,61 @@ static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, } -static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx, - Long delta, Bool isAvx ) +/* Handles 256 bit versions of PMOVZXBQ. */ +static Long dis_PMOVZXBQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; + IRTemp addr = IRTemp_INVALID; + Int alen = 0; HChar dis_buf[50]; - UChar modrm = getUChar(delta); - const HChar* mbV = isAvx ? "v" : ""; - IRTemp sV = newTemp(Ity_V128); - IRTemp sHi = newTemp(Ity_I64); + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ))); + delta += alen; + DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp(Ity_V128); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + putYMMReg + ( rG, binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ) + ) ); + return delta; +} + + +static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + const HChar* mbV = isAvx ? 
"v" : ""; + IRTemp sV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); IRTemp sLo = newTemp(Ity_I64); IRTemp dLo = newTemp(Ity_I64); UInt rG = gregOfRexRM(pfx,modrm); @@ -19427,7 +20273,7 @@ Long dis_ESC_NONE ( cond = mkAnd1(cond, zbit); break; default: - vassert(0); + vassert(0); } stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) ); @@ -20258,9 +21104,17 @@ Long dis_ESC_0F ( return delta; case 0xBC: /* BSF Gv,Ev */ - if (haveF2(pfx)) goto decode_failure; - delta = dis_bs_E_G ( vbi, pfx, sz, delta, True ); - return delta; + if (!haveF2orF3(pfx) + || (haveF3noF2(pfx) + && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) { + /* no-F2 no-F3 0F BC = BSF + or F3 0F BC = REP; BSF on older CPUs. */ + delta = dis_bs_E_G ( vbi, pfx, sz, delta, True ); + return delta; + } + /* Fall through, since F3 0F BC is TZCNT, and needs to + be handled by dis_ESC_0F__SSE4. */ + break; case 0xBD: /* BSR Gv,Ev */ if (!haveF2orF3(pfx) @@ -20913,6 +21767,192 @@ static ULong dis_AVX128_shiftV_byE ( VexAbiInfo* vbi, } +/* Vector by scalar shift of V by the amount specified at the bottom + of E. */ +static ULong dis_AVX256_shiftV_byE ( VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen, size; + IRTemp addr; + Bool shl, shr, sar; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx);; + IRTemp g0 = newTemp(Ity_V256); + IRTemp g1 = newTemp(Ity_V256); + IRTemp amt = newTemp(Ity_I64); + IRTemp amt8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( amt, getXMMRegLane64(rE, 0) ); + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG) ); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( amt, loadLE(Ity_I64, mkexpr(addr)) ); + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) ); + delta += alen; + } + assign( g0, getYMMReg(rV) ); + assign( amt8, unop(Iop_64to8, mkexpr(amt)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + case Iop_ShlN16x16: shl = True; size = 32; break; + case Iop_ShlN32x8: shl = True; size = 32; break; + case Iop_ShlN64x4: shl = True; size = 64; break; + case Iop_SarN16x16: sar = True; size = 16; break; + case Iop_SarN32x8: sar = True; size = 32; break; + case Iop_ShrN16x16: shr = True; size = 16; break; + case Iop_ShrN32x8: shr = True; size = 32; break; + case Iop_ShrN64x4: shr = True; size = 64; break; + default: vassert(0); + } + + if (shl || shr) { + assign( + g1, + IRExpr_ITE( + binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)), + binop(op, mkexpr(g0), mkexpr(amt8)), + binop(Iop_V128HLtoV256, mkV128(0), mkV128(0)) + ) + ); + } else + if (sar) { + assign( + g1, + IRExpr_ITE( + binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)), + binop(op, mkexpr(g0), mkexpr(amt8)), + binop(op, mkexpr(g0), mkU8(size-1)) + ) + ); + } else { + vassert(0); + } + + putYMMReg( rG, mkexpr(g1) ); + return delta; +} + + +/* Vector by vector shift of V by the amount specified at the bottom + of E. Vector by vector shifts are defined for all shift amounts, + so not using Iop_S*x* here (and SSE2 doesn't support variable shifts + anyway). */ +static ULong dis_AVX_var_shiftV_byE ( VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, IROp op, Bool isYMM ) +{ + HChar dis_buf[50]; + Int alen, size, i; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx);; + IRTemp sV = isYMM ? 
newTemp(Ity_V256) : newTemp(Ity_V128); + IRTemp amt = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128); + IRTemp amts[8], sVs[8], res[8]; + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE), + nameXMMReg(rV), nameXMMReg(rG) ); + } + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), + nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), + nameXMMReg(rG) ); + } + delta += alen; + } + assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) ); + + size = 0; + switch (op) { + case Iop_Shl32: size = 32; break; + case Iop_Shl64: size = 64; break; + case Iop_Sar32: size = 32; break; + case Iop_Shr32: size = 32; break; + case Iop_Shr64: size = 64; break; + default: vassert(0); + } + + for (i = 0; i < 8; i++) { + sVs[i] = IRTemp_INVALID; + amts[i] = IRTemp_INVALID; + } + switch (size) { + case 32: + if (isYMM) { + breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4], + &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4], + &amts[3], &amts[2], &amts[1], &amts[0] ); + } else { + breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] ); + } + break; + case 64: + if (isYMM) { + breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] ); + } else { + breakupV128to64s( sV, &sVs[1], &sVs[0] ); + breakupV128to64s( amt, &amts[1], &amts[0] ); + } + break; + default: vassert(0); + } + for (i = 0; i < 8; i++) + if (sVs[i] != IRTemp_INVALID) { + res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64); + assign( res[i], + IRExpr_ITE( + binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U, + mkexpr(amts[i]), + size == 32 ? mkU32(size) : mkU64(size)), + binop(op, mkexpr(sVs[i]), + unop(size == 32 ? Iop_32to8 : Iop_64to8, + mkexpr(amts[i]))), + op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1)) + : size == 32 ? mkU32(0) : mkU64(0) + )); + } + switch (size) { + case 32: + for (i = 0; i < 8; i++) + putYMMRegLane32( rG, i, (i < 4 || isYMM) + ? mkexpr(res[i]) : mkU32(0) ); + break; + case 64: + for (i = 0; i < 4; i++) + putYMMRegLane64( rG, i, (i < 2 || isYMM) + ? mkexpr(res[i]) : mkU64(0) ); + break; + default: vassert(0); + } + + return delta; +} + + /* Vector by scalar shift of E into V, by an immediate byte. Modified version of dis_SSE_shiftE_imm. */ static @@ -20970,6 +22010,64 @@ Long dis_AVX128_shiftE_to_V_imm( Prefix pfx, } +/* Vector by scalar shift of E into V, by an immediate byte. Modified + version of dis_AVX128_shiftE_to_V_imm. 
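   (Aside, for exposition only: the shift helpers above share one
   out-of-range rule -- a count greater than or equal to the element width
   zeroes the element for logical shifts and fills it with the sign bit for
   arithmetic shifts.  For the variable-count forms (VPSLLVD, VPSRLVD,
   VPSRAVD and the 64-bit variants) each element additionally takes its
   count from the matching element of the second operand:

   #include <stdint.h>

   static void vpsrlvd_reference ( uint32_t dst[8], const uint32_t src[8],
                                   const uint32_t counts[8] )
   {
      for (int i = 0; i < 8; i++)
         dst[i] = (counts[i] >= 32) ? 0u : (src[i] >> counts[i]);
   }

   static void vpsravd_reference ( int32_t dst[8], const int32_t src[8],
                                   const uint32_t counts[8] )
   {
      // assumes '>>' on signed values is an arithmetic shift, as it is on
      // every target VEX supports
      for (int i = 0; i < 8; i++)
         dst[i] = src[i] >> (counts[i] >= 32 ? 31 : counts[i]);
   }

   End of aside.)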
*/ +static +Long dis_AVX256_shiftE_to_V_imm( Prefix pfx, + Long delta, const HChar* opname, IROp op ) +{ + Bool shl, shr, sar; + UChar rm = getUChar(delta); + IRTemp e0 = newTemp(Ity_V256); + IRTemp e1 = newTemp(Ity_V256); + UInt rD = getVexNvvvv(pfx); + UChar amt, size; + vassert(epartIsReg(rm)); + vassert(gregLO3ofRM(rm) == 2 + || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6); + amt = getUChar(delta+1); + delta += 2; + DIP("%s $%d,%s,%s\n", opname, + (Int)amt, + nameYMMReg(eregOfRexRM(pfx,rm)), + nameYMMReg(rD)); + assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + case Iop_ShlN16x16: shl = True; size = 16; break; + case Iop_ShlN32x8: shl = True; size = 32; break; + case Iop_ShlN64x4: shl = True; size = 64; break; + case Iop_SarN16x16: sar = True; size = 16; break; + case Iop_SarN32x8: sar = True; size = 32; break; + case Iop_ShrN16x16: shr = True; size = 16; break; + case Iop_ShrN32x8: shr = True; size = 32; break; + case Iop_ShrN64x4: shr = True; size = 64; break; + default: vassert(0); + } + + + if (shl || shr) { + assign( e1, amt >= size + ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0)) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else + if (sar) { + assign( e1, amt >= size + ? binop(op, mkexpr(e0), mkU8(size-1)) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else { + vassert(0); + } + + putYMMReg( rD, mkexpr(e1) ); + return delta; +} + + /* Lower 64-bit lane only AVX128 binary operation: G[63:0] = V[63:0] `op` E[63:0] G[127:64] = V[127:64] @@ -21483,6 +22581,21 @@ static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv, } +/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp + for the operation, no inversion of the left arg, and no swapping of + args. */ +static +Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple ( + /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, const HChar* name, + IROp op + ) +{ + return dis_VEX_NDS_256_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False); +} + + /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR generator to compute the result, no inversion of the left arg, and no swapping of args. */ @@ -21499,6 +22612,39 @@ Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex ( } +/* Handles AVX256 unary E-to-G all-lanes operations. */ +static +Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, + IRTemp (*opFn)(IRTemp) ) +{ + HChar dis_buf[50]; + Int alen; + IRTemp addr; + IRTemp res = newTemp(Ity_V256); + IRTemp arg = newTemp(Ity_V256); + UChar rm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, rm); + if (epartIsReg(rm)) { + UInt rE = eregOfRexRM(pfx,rm); + assign(arg, getYMMReg(rE)); + delta += 1; + DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign(arg, loadLE(Ity_V256, mkexpr(addr))); + delta += alen; + DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG)); + } + res = opFn(arg); + putYMMReg( rG, mkexpr(res) ); + *uses_vvvv = False; + return delta; +} + + /* Handles AVX256 unary E-to-G all-lanes operations. 
*/ static Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv, @@ -21605,37 +22751,123 @@ static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx, } -__attribute__((noinline)) -static -Long dis_ESC_0F__VEX ( - /*MB_OUT*/DisResult* dres, - /*OUT*/ Bool* uses_vvvv, - Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), - Bool resteerCisOk, - void* callback_opaque, - VexArchInfo* archinfo, - VexAbiInfo* vbi, - Prefix pfx, Int sz, Long deltaIN - ) +static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; - HChar dis_buf[50]; - Long delta = deltaIN; - UChar opc = getUChar(delta); - delta++; - *uses_vvvv = False; + IRTemp tLhi, tLlo, tRhi, tRlo; + tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID; + IRTemp res = newTemp(Ity_V256); + breakupV256toV128s( tL, &tLhi, &tLlo ); + breakupV256toV128s( tR, &tRhi, &tRlo ); + assign( res, binop( Iop_V128HLtoV256, + binop( op, mkexpr(tRhi), mkexpr(tLhi) ), + binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) ); + return res; +} - switch (opc) { - case 0x10: - /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ - /* Move 64 bits from E (mem only) to G (lo half xmm). - Bits 255-64 of the dest are zeroed out. */ - if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { - UChar modrm = getUChar(delta); - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - UInt rG = gregOfRexRM(pfx,modrm); +static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 ); +} + + +static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 ); +} + + +static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 ); +} + + +static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 ); +} + + +static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 ); +} + + +static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 ); +} + + +static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 ); +} + + +static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 ); +} + + +static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 ); +} + + +static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 ); +} + + +static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 ); +} + + +static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 ); +} + + +__attribute__((noinline)) +static +Long dis_ESC_0F__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x10: + /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ + /* Move 64 bits from E (mem only) to G (lo half xmm). 
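   (Aside, for exposition only: the math_VPUNPCK_YMM family defined above
   applies the 128-bit interleave or pack operation to the two halves of
   the YMM operands separately, so, as in hardware, no data crosses the
   128-bit lane boundary.  Per-lane reference for VPUNPCKLBW, with src1
   playing the role of the vvvv register and src2 the r/m operand:

   #include <stdint.h>

   static void vpunpcklbw_lane_reference ( uint8_t dst[16],
                                           const uint8_t src1[16],
                                           const uint8_t src2[16] )
   {
      for (int i = 0; i < 8; i++) {      // only the low 8 bytes of each lane
         dst[2*i]     = src1[i];
         dst[2*i + 1] = src2[i];
      }
   }

   End of aside.)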
+ Bits 255-64 of the dest are zeroed out. */ + if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); IRTemp z128 = newTemp(Ity_V128); assign(z128, mkV128(0)); putXMMReg( rG, mkexpr(z128) ); @@ -23128,6 +24360,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */ + /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklbw", + math_VPUNPCKLBW_YMM ); + goto decode_success; + } break; case 0x61: @@ -23140,6 +24380,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */ + /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklwd", + math_VPUNPCKLWD_YMM ); + goto decode_success; + } break; case 0x62: @@ -23152,6 +24400,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */ + /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckldq", + math_VPUNPCKLDQ_YMM ); + goto decode_success; + } break; case 0x63: @@ -23164,6 +24420,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */ + /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpacksswb", + math_VPACKSSWB_YMM ); + goto decode_success; + } break; case 0x64: @@ -23174,6 +24438,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 ); goto decode_success; } + /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */ + /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 ); + goto decode_success; + } break; case 0x65: @@ -23184,6 +24455,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 ); goto decode_success; } + /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */ + /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 ); + goto decode_success; + } break; case 0x66: @@ -23194,6 +24472,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 ); goto decode_success; } + /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */ + /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 ); + goto decode_success; + } break; case 0x67: @@ -23206,6 +24491,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ 
); goto decode_success; } + /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */ + /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackuswb", + math_VPACKUSWB_YMM ); + goto decode_success; + } break; case 0x68: @@ -23218,6 +24511,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */ + /* VPUNPCKHBW = VEX.NDS.256.0F.WIG 68 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhbw", + math_VPUNPCKHBW_YMM ); + goto decode_success; + } break; case 0x69: @@ -23230,6 +24531,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */ + /* VPUNPCKHWD = VEX.NDS.256.0F.WIG 69 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhwd", + math_VPUNPCKHWD_YMM ); + goto decode_success; + } break; case 0x6A: @@ -23242,6 +24551,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */ + /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhdq", + math_VPUNPCKHDQ_YMM ); + goto decode_success; + } break; case 0x6B: @@ -23254,6 +24571,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */ + /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackssdw", + math_VPACKSSDW_YMM ); + goto decode_success; + } break; case 0x6C: @@ -23266,6 +24591,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */ + /* VPUNPCKLQDQ = VEX.NDS.256.0F.WIG 6C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklqdq", + math_VPUNPCKLQDQ_YMM ); + goto decode_success; + } break; case 0x6D: @@ -23278,6 +24611,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */ + /* VPUNPCKHQDQ = VEX.NDS.256.0F.WIG 6D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhqdq", + math_VPUNPCKHQDQ_YMM ); + goto decode_success; + } break; case 0x6E: @@ -23392,18 +24733,33 @@ Long dis_ESC_0F__VEX ( delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/); goto decode_success; } + /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFD_32x8( vbi, pfx, delta); + goto decode_success; + } /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */ if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) { delta = 
dis_PSHUFxW_128( vbi, pfx, delta, True/*isAvx*/, False/*!xIsH*/ ); goto decode_success; } + /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */ + if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ ); + goto decode_success; + } /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */ if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) { delta = dis_PSHUFxW_128( vbi, pfx, delta, True/*isAvx*/, True/*xIsH*/ ); goto decode_success; } + /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */ + if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ ); + goto decode_success; + } break; case 0x71: @@ -23433,6 +24789,32 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */ + /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */ + /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrlw", Iop_ShrN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsraw", Iop_SarN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsllw", Iop_ShlN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x72: @@ -23462,6 +24844,32 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */ + /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */ + /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrld", Iop_ShrN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrad", Iop_SarN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpslld", Iop_ShlN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x73: @@ -23506,6 +24914,54 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */ + /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */ + /* VPSRLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */ + /* VPSLLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + Int rS = eregOfRexRM(pfx,getUChar(delta)); + Int rD = getVexNvvvv(pfx); + if (gregLO3ofRM(getUChar(delta)) == 3) { + IRTemp vecS0 = newTemp(Ity_V128); + IRTemp vecS1 = newTemp(Ity_V128); + Int imm = (Int)getUChar(delta+1); + DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD)); + delta += 2; + assign( vecS0, getYMMRegLane128(rS, 0)); + assign( vecS1, getYMMRegLane128(rS, 1)); + putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm ))); + putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, 
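                                                 /* the same byte shift is applied to each 128-bit
                                                    lane independently, per AVX2 semantics */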
imm ))); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 7) { + IRTemp vecS0 = newTemp(Ity_V128); + IRTemp vecS1 = newTemp(Ity_V128); + Int imm = (Int)getUChar(delta+1); + DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD)); + delta += 2; + assign( vecS0, getYMMRegLane128(rS, 0)); + assign( vecS1, getYMMRegLane128(rS, 1)); + putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm ))); + putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm ))); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 2) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrlq", Iop_ShrN64x4 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsllq", Iop_ShlN64x4 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x74: @@ -23516,6 +24972,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 ); goto decode_success; } + /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */ + /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 ); + goto decode_success; + } break; case 0x75: @@ -23526,6 +24989,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 ); goto decode_success; } + /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */ + /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 ); + goto decode_success; + } break; case 0x76: @@ -23536,6 +25006,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 ); goto decode_success; } + /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */ + /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 ); + goto decode_success; + } break; case 0x77: @@ -24102,6 +25579,14 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; + } + /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrlw", Iop_ShrN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; @@ -24113,6 +25598,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrld", Iop_ShrN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xD3: @@ -24123,6 +25615,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrlq", Iop_ShrN64x4 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xD4: @@ -24133,6 +25632,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 ); goto decode_success; } + /* VPADDQ r/m, rV, r ::: r = rV + r/m */ + /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */ + if (have66noF2noF3(pfx) 
&& 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 ); + goto decode_success; + } break; case 0xD5: @@ -24142,8 +25648,14 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 ); goto decode_success; } - break; - + /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 ); + goto decode_success; + } + break; + case 0xD6: /* I can't even find any Intel docs for this one. */ /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half @@ -24172,6 +25684,11 @@ Long dis_ESC_0F__VEX ( delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVMSKB_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0xD8: @@ -24181,7 +25698,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 ); goto decode_success; } - break; + /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 ); + goto decode_success; + } + break; case 0xD9: /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */ @@ -24190,6 +25713,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 ); goto decode_success; } + /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 ); + goto decode_success; + } break; case 0xDA: @@ -24199,6 +25728,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 ); goto decode_success; } + /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 ); + goto decode_success; + } break; case 0xDB: @@ -24209,6 +25744,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 ); goto decode_success; } + /* VPAND r/m, rV, r ::: r = rV & r/m */ + /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 ); + goto decode_success; + } break; case 0xDC: @@ -24218,6 +25760,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 ); goto decode_success; } + /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 ); + goto decode_success; + } break; case 0xDD: @@ -24227,6 +25775,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 ); goto decode_success; } + /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 ); + goto decode_success; + } break; case 0xDE: @@ -24236,6 +25790,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, 
"vpmaxub", Iop_Max8Ux16 ); goto decode_success; } + /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 ); + goto decode_success; + } break; case 0xDF: @@ -24247,6 +25807,14 @@ Long dis_ESC_0F__VEX ( NULL, True/*invertLeftArg*/, False/*swapArgs*/ ); goto decode_success; } + /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */ + /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256, + NULL, True/*invertLeftArg*/, False/*swapArgs*/ ); + goto decode_success; + } break; case 0xE0: @@ -24256,6 +25824,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 ); goto decode_success; } + /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 ); + goto decode_success; + } break; case 0xE1: @@ -24266,6 +25840,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsraw", Iop_SarN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xE2: @@ -24276,6 +25857,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrad", Iop_SarN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xE3: @@ -24285,6 +25873,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 ); goto decode_success; } + /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 ); + goto decode_success; + } break; case 0xE4: @@ -24294,6 +25888,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 ); goto decode_success; } + /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 ); + goto decode_success; + } break; case 0xE5: @@ -24303,6 +25903,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 ); goto decode_success; } + /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 ); + goto decode_success; + } break; case 0xE6: @@ -24378,6 +25984,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 ); goto decode_success; } + /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 ); + goto decode_success; + } break; case 0xE9: @@ -24387,6 +25999,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 ); 
goto decode_success; } + /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 ); + goto decode_success; + } break; case 0xEA: @@ -24397,6 +26015,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 ); goto decode_success; } + /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */ + /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 ); + goto decode_success; + } break; case 0xEB: @@ -24407,6 +26032,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 ); goto decode_success; } + /* VPOR r/m, rV, r ::: r = rV | r/m */ + /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 ); + goto decode_success; + } break; case 0xEC: @@ -24416,6 +26048,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 ); goto decode_success; } + /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 ); + goto decode_success; + } break; case 0xED: @@ -24425,6 +26063,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 ); goto decode_success; } + /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 ); + goto decode_success; + } break; case 0xEE: @@ -24435,6 +26079,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 ); goto decode_success; } + /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */ + /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 ); + goto decode_success; + } break; case 0xEF: @@ -24445,6 +26096,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 ); goto decode_success; } + /* VPXOR r/m, rV, r ::: r = rV ^ r/m */ + /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 ); + goto decode_success; + } break; case 0xF0: @@ -24484,6 +26142,14 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; + } + /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsllw", Iop_ShlN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; @@ -24495,6 +26161,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpslld", Iop_ShlN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xF3: @@ -24505,6 +26178,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* 
VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsllq", Iop_ShlN64x4 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xF4: @@ -24515,6 +26195,13 @@ Long dis_ESC_0F__VEX ( "vpmuludq", math_PMULUDQ_128 ); goto decode_success; } + /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmuludq", math_PMULUDQ_256 ); + goto decode_success; + } break; case 0xF5: @@ -24525,6 +26212,13 @@ Long dis_ESC_0F__VEX ( "vpmaddwd", math_PMADDWD_128 ); goto decode_success; } + /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmaddwd", math_PMADDWD_256 ); + goto decode_success; + } break; case 0xF6: @@ -24535,6 +26229,13 @@ Long dis_ESC_0F__VEX ( "vpsadbw", math_PSADBW_128 ); goto decode_success; } + /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpsadbw", math_PSADBW_256 ); + goto decode_success; + } break; case 0xF7: @@ -24554,6 +26255,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 ); goto decode_success; } + /* VPSUBB r/m, rV, r ::: r = rV - r/m */ + /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 ); + goto decode_success; + } break; case 0xF9: @@ -24564,6 +26272,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 ); goto decode_success; } + /* VPSUBW r/m, rV, r ::: r = rV - r/m */ + /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 ); + goto decode_success; + } break; case 0xFA: @@ -24574,6 +26289,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 ); goto decode_success; } + /* VPSUBD r/m, rV, r ::: r = rV - r/m */ + /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 ); + goto decode_success; + } break; case 0xFB: @@ -24584,6 +26306,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 ); goto decode_success; } + /* VPSUBQ r/m, rV, r ::: r = rV - r/m */ + /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 ); + goto decode_success; + } break; case 0xFC: @@ -24594,6 +26323,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 ); goto decode_success; } + /* VPADDB r/m, rV, r ::: r = rV + r/m */ + /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 ); + goto decode_success; + } break; case 0xFD: @@ -24604,6 +26340,13 @@ Long 
dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 ); goto decode_success; } + /* VPADDW r/m, rV, r ::: r = rV + r/m */ + /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 ); + goto decode_success; + } break; case 0xFE: @@ -24614,6 +26357,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 ); goto decode_success; } + /* VPADDD r/m, rV, r ::: r = rV + r/m */ + /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 ); + goto decode_success; + } break; default: @@ -24695,81 +26445,504 @@ static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV ) return res; } -__attribute__((noinline)) -static -Long dis_ESC_0F38__VEX ( - /*MB_OUT*/DisResult* dres, - /*OUT*/ Bool* uses_vvvv, - Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), - Bool resteerCisOk, - void* callback_opaque, - VexArchInfo* archinfo, - VexAbiInfo* vbi, - Prefix pfx, Int sz, Long deltaIN - ) +static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; - HChar dis_buf[50]; - Long delta = deltaIN; - UChar opc = getUChar(delta); - delta++; - *uses_vvvv = False; + /* In the control vector, zero out all but the bottom three bits of + each 32-bit lane. */ + IRExpr* cv1 = binop(Iop_ShrN32x8, + binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)), + mkU8(29)); + /* And use the resulting cleaned-up control vector as steering + in a Perm operation. */ + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1)); + return res; +} - switch (opc) { +static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, Prefix pfx, Long delta, + const HChar* opname, IROp op8 ) +{ + HChar dis_buf[50]; + Int alen; + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp amt = newTemp(ty); + UChar rm = getUChar(delta); - case 0x00: - /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ - /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( - uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM ); - goto decode_success; - } - break; + assign( amt, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } - case 0x01: - case 0x02: - case 0x03: - /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */ - /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */ - /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); - *uses_vvvv = True; - goto decode_success; - } - break; + putIRegG( size, pfx, rm, + binop(mkSizedOp(ty,op8), mkexpr(src), + narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt), + mkU(ty,8*size-1)))) ); + /* Flags aren't modified. 
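      (BMI2 SHLX/SHRX/SARX leave rflags untouched, unlike the legacy shift
      instructions; the count has already been masked to the operand width
      by the And above.)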
*/ + *uses_vvvv = True; + return delta; +} - case 0x04: - /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( - uses_vvvv, vbi, pfx, delta, "vpmaddubsw", - math_PMADDUBSW_128 ); - goto decode_success; - } + +static Long dis_FMA ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc ) +{ + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + Bool scalar = (opc & 0xF) > 7 && (opc & 1); + IRType ty = getRexW(pfx) ? Ity_F64 : Ity_F32; + IRType vty = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128; + IRTemp vX = newTemp(vty); + IRTemp vY = newTemp(vty); + IRTemp vZ = newTemp(vty); + IRExpr *x[8], *y[8], *z[8]; + IRTemp addr = IRTemp_INVALID; + HChar dis_buf[50]; + Int alen = 0; + const HChar *name; + const HChar *suffix; + const HChar *order; + Bool negateRes = False; + Bool negateZeven = False; + Bool negateZodd = False; + Int i, j; + Int count; + static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1, + Iop_V256to64_2, Iop_V256to64_3, + Iop_V128to64, Iop_V128HIto64 }; + + switch (opc & 0xF) { + case 0x6: + name = "addsub"; + negateZeven = True; + break; + case 0x7: + name = "subadd"; + negateZodd = True; + break; + case 0x8: + case 0x9: + name = "add"; + break; + case 0xA: + case 0xB: + name = "sub"; + negateZeven = True; + negateZodd = True; + break; + case 0xC: + case 0xD: + name = "add"; + negateRes = True; + negateZeven = True; + negateZodd = True; + break; + case 0xE: + case 0xF: + name = "sub"; + negateRes = True; break; - - case 0x05: - case 0x06: - case 0x07: - /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */ - /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */ - /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); - *uses_vvvv = True; - goto decode_success; - } + default: + vpanic("dis_FMA(amd64)"); break; + } + switch (opc & 0xF0) { + case 0x90: order = "132"; break; + case 0xA0: order = "213"; break; + case 0xB0: order = "231"; break; + default: vpanic("dis_FMA(amd64)"); break; + } + if (scalar) + suffix = ty == Ity_F64 ? "sd" : "ss"; + else + suffix = ty == Ity_F64 ? "pd" : "ps"; - case 0x08: - case 0x09: - case 0x0A: - /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */ - /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */ - /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */ + if (scalar) { + assign( vX, ty == Ity_F64 + ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) ); + assign( vZ, ty == Ity_F64 + ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) ); + } else { + assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) ); + assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) ); + } + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + if (scalar) + assign( vY, ty == Ity_F64 + ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) ); + else + assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) ); + if (vty == Ity_V256) { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, nameYMMReg(rE), nameYMMReg(rV), + nameYMMReg(rG)); + } else { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? 
"n" : "", + name, order, suffix, nameXMMReg(rE), nameXMMReg(rV), + nameXMMReg(rG)); + } + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + assign(vY, loadLE(vty, mkexpr(addr))); + if (vty == Ity_V256) { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, dis_buf, nameYMMReg(rV), + nameYMMReg(rG)); + } else { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, dis_buf, nameXMMReg(rV), + nameXMMReg(rG)); + } + } + + /* vX/vY/vZ now in 132 order. If it is different order, swap the + arguments. */ + if ((opc & 0xF0) != 0x90) { + IRTemp tem = vX; + if ((opc & 0xF0) == 0xA0) { + vX = vZ; + vZ = vY; + vY = tem; + } else { + vX = vZ; + vZ = tem; + } + } + + if (scalar) { + count = 1; + x[0] = mkexpr(vX); + y[0] = mkexpr(vY); + z[0] = mkexpr(vZ); + } else if (ty == Ity_F32) { + count = vty == Ity_V256 ? 8 : 4; + j = vty == Ity_V256 ? 0 : 4; + for (i = 0; i < count; i += 2) { + IRTemp tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vX))); + x[i] = unop(Iop_64to32, mkexpr(tem)); + x[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vY))); + y[i] = unop(Iop_64to32, mkexpr(tem)); + y[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vZ))); + z[i] = unop(Iop_64to32, mkexpr(tem)); + z[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + } + } else { + count = vty == Ity_V256 ? 4 : 2; + j = vty == Ity_V256 ? 0 : 4; + for (i = 0; i < count; i++) { + x[i] = unop(ops[i + j], mkexpr(vX)); + y[i] = unop(ops[i + j], mkexpr(vY)); + z[i] = unop(ops[i + j], mkexpr(vZ)); + } + } + if (!scalar) + for (i = 0; i < count; i++) { + IROp op = ty == Ity_F64 + ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32; + x[i] = unop(op, x[i]); + y[i] = unop(op, y[i]); + z[i] = unop(op, z[i]); + } + for (i = 0; i < count; i++) { + if ((i & 1) ? negateZodd : negateZeven) + z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]); + x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32, + get_FAKE_roundingmode(), x[i], y[i], z[i]); + if (negateRes) + x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]); + if (ty == Ity_F64) + putYMMRegLane64F( rG, i, x[i] ); + else + putYMMRegLane32F( rG, i, x[i] ); + } + if (vty != Ity_V256) + putYMMRegLane128( rG, 1, mkV128(0) ); + + return delta; +} + + +/* Masked load. */ +static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, Bool isYMM, IRType ty ) +{ + HChar dis_buf[50]; + Int alen, i; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp res[8], cond; + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) ); + } + delta += alen; + + for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) { + res[i] = newTemp(ty); + cond = newTemp(Ity_I1); + assign( cond, + binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S, + ty == Ity_I32 ? getYMMRegLane32( rV, i ) + : getYMMRegLane64( rV, i ), + mkU(ty, 0) )); + assign( res[i], + IRExpr_ITE( + mkexpr(cond), + loadLE(ty, IRExpr_ITE( + mkexpr(cond), + binop(Iop_Add64, mkexpr(addr), + mkU64(i*(ty == Ity_I32 ? 
4 : 8))), + getIReg64(R_RSP) + ) + ), + mkU(ty, 0) + ) + ); + } + switch (ty) { + case Ity_I32: + for (i = 0; i < 8; i++) + putYMMRegLane32( rG, i, (i < 4 || isYMM) + ? mkexpr(res[i]) : mkU32(0) ); + break; + case Ity_I64: + for (i = 0; i < 4; i++) + putYMMRegLane64( rG, i, (i < 2 || isYMM) + ? mkexpr(res[i]) : mkU64(0) ); + break; + default: vassert(0); + } + + *uses_vvvv = True; + return delta; +} + + +/* Gather. */ +static ULong dis_VGATHER ( Bool *uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, Bool isYMM, + Bool isVM64x, IRType ty ) +{ + HChar dis_buf[50]; + Int alen, i, vscale, count1, count2; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + UInt rI; + IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128; + IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128; + IRTemp cond; + addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI, + idxTy, &vscale ); + if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV) + return delta; + if (dstTy == Ity_V256) { + DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) ); + } + delta += alen; + + if (ty == Ity_I32) { + count1 = isYMM ? 8 : 4; + count2 = isVM64x ? count1 / 2 : count1; + } else { + count1 = count2 = isYMM ? 4 : 2; + } + + /* First update the mask register to copies of the sign bit. */ + if (ty == Ity_I32) { + if (isYMM) + putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) ); + else + putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) ); + } else { + for (i = 0; i < count1; i++) { + putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ), + mkU8(63)) ); + } + } + + /* Next gather the individual elements. If any fault occurs, the + corresponding mask element will be set and the loop stops. */ + for (i = 0; i < count2; i++) { + IRExpr *expr, *addr_expr; + cond = newTemp(Ity_I1); + assign( cond, + binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S, + ty == Ity_I32 ? getYMMRegLane32( rV, i ) + : getYMMRegLane64( rV, i ), + mkU(ty, 0)) ); + expr = ty == Ity_I32 ? getYMMRegLane32( rG, i ) + : getYMMRegLane64( rG, i ); + addr_expr = isVM64x ? 
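                /* The element index comes from the VSIB index register rI
                   (32-bit indices are sign extended); it is scaled by vscale
                   below, and inactive elements have their load redirected to
                   RSP so that they cannot fault. */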
getYMMRegLane64( rI, i ) + : unop(Iop_32Sto64, getYMMRegLane32( rI, i )); + switch (vscale) { + case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break; + case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break; + case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break; + default: break; + } + addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr); + addr_expr = handleAddrOverrides(vbi, pfx, addr_expr); + addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP)); + expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr); + if (ty == Ity_I32) { + putYMMRegLane32( rG, i, expr ); + putYMMRegLane32( rV, i, mkU32(0) ); + } else { + putYMMRegLane64( rG, i, expr); + putYMMRegLane64( rV, i, mkU64(0) ); + } + } + + if (!isYMM || (ty == Ity_I32 && isVM64x)) { + if (ty == Ity_I64 || isYMM) + putYMMRegLane128( rV, 1, mkV128(0) ); + else if (ty == Ity_I32 && count2 == 2) { + putYMMRegLane64( rV, 1, mkU64(0) ); + putYMMRegLane64( rG, 1, mkU64(0) ); + } + putYMMRegLane128( rG, 1, mkV128(0) ); + } + + *uses_vvvv = True; + return delta; +} + + +__attribute__((noinline)) +static +Long dis_ESC_0F38__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x00: + /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ + /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM ); + goto decode_success; + } + /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ + /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM ); + goto decode_success; + } + break; + + case 0x01: + case 0x02: + case 0x03: + /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */ + /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */ + /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); + *uses_vvvv = True; + goto decode_success; + } + /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */ + /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */ + /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PHADD_256( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x04: + /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpmaddubsw", + math_PMADDUBSW_128 ); + goto decode_success; + } + /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpmaddubsw", + math_PMADDUBSW_256 ); + goto decode_success; + } + break; + + case 0x05: + case 
0x06: + case 0x07: + /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */ + /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */ + /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); + *uses_vvvv = True; + goto decode_success; + } + /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */ + /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */ + /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PHADD_256( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x08: + case 0x09: + case 0x0A: + /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */ + /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */ + /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { IRTemp sV = newTemp(Ity_V128); IRTemp dV = newTemp(Ity_V128); @@ -24817,6 +26990,63 @@ Long dis_ESC_0F38__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */ + /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */ + /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar ch = '?'; + Int laneszB = 0; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + switch (opc) { + case 0x08: laneszB = 1; ch = 'b'; break; + case 0x09: laneszB = 2; ch = 'w'; break; + case 0x0A: laneszB = 4; ch = 'd'; break; + default: vassert(0); + } + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + delta += 1; + DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + delta += alen; + DIP("vpsign%c %s,%s,%s\n", ch, dis_buf, + nameYMMReg(rV), nameYMMReg(rG)); + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + putYMMReg( + rG, + binop( Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ), + dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB ) + ), + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ), + dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB ) + ) + ) + ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x0B: @@ -24859,6 +27089,49 @@ Long dis_ESC_0F38__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + delta += 1; + 
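            /* PMULHRSW: per 16-bit lane, ((src*dst >> 14) + 1) >> 1, i.e. a
               rounded high half of the 16x16->32 signed product; computed in
               64-bit chunks by dis_PMULHRSW_helper below. */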
DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + delta += alen; + DIP("vpmulhrsw %s,%s,%s\n", dis_buf, + nameYMMReg(rV), nameYMMReg(rG)); + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + putYMMReg( + rG, + binop(Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ), + dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ), + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ), + dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) ) + ) + ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x0C: @@ -25001,6 +27274,16 @@ Long dis_ESC_0F38__VEX ( } break; + case 0x16: + /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD ); + goto decode_success; + } + break; + case 0x17: /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { @@ -25050,10 +27333,45 @@ Long dis_ESC_0F38__VEX ( putYMMReg(rG, res); goto decode_success; } - break; - - case 0x19: - /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */ + /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, getXMMRegLane32(rE, 0)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + delta++; + goto decode_success; + } + /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, getXMMRegLane32(rE, 0)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + delta++; + goto decode_success; + } + break; + + case 0x19: + /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) { @@ -25069,6 +27387,22 @@ Long dis_ESC_0F38__VEX ( putYMMReg(rG, res); goto decode_success; } + /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, getXMMRegLane64(rE, 0)); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + delta++; + goto decode_success; + } break; case 0x1A: @@ -25096,6 +27430,13 @@ Long 
dis_ESC_0F38__VEX ( "vpabsb", math_PABS_XMM_pap1 ); goto decode_success; } + /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsb", math_PABS_YMM_pap1 ); + goto decode_success; + } break; case 0x1D: @@ -25106,6 +27447,13 @@ Long dis_ESC_0F38__VEX ( "vpabsw", math_PABS_XMM_pap2 ); goto decode_success; } + /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsw", math_PABS_YMM_pap2 ); + goto decode_success; + } break; case 0x1E: @@ -25116,6 +27464,13 @@ Long dis_ESC_0F38__VEX ( "vpabsd", math_PABS_XMM_pap4 ); goto decode_success; } + /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsd", math_PABS_YMM_pap4 ); + goto decode_success; + } break; case 0x20: @@ -25126,6 +27481,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXBW xmm2/m128, ymm1 */ + /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x21: @@ -25136,6 +27497,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXBD xmm2/m64, ymm1 */ + /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x22: @@ -25145,6 +27512,12 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVSXBQ xmm2/m32, ymm1 */ + /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVSXBQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x23: @@ -25154,6 +27527,11 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x24: @@ -25162,6 +27540,11 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVSXWQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x25: @@ -25171,6 +27554,11 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x28: @@ -25181,6 +27569,13 @@ Long dis_ESC_0F38__VEX ( "vpmuldq", math_PMULDQ_128 ); goto decode_success; } + /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmuldq", math_PMULDQ_256 ); + goto decode_success; + } break; case 0x29: @@ -25191,6 +27586,13 @@ Long dis_ESC_0F38__VEX 
( uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 ); goto decode_success; } + /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */ + /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 ); + goto decode_success; + } break; case 0x2A: @@ -25208,6 +27610,20 @@ Long dis_ESC_0F38__VEX ( putYMMRegLoAndZU(rD, mkexpr(tD)); goto decode_success; } + /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp tD = newTemp(Ity_V256); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + gen_SEGV_if_not_32_aligned(addr); + assign(tD, loadLE(Ity_V256, mkexpr(addr))); + DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD)); + putYMMReg(rD, mkexpr(tD)); + goto decode_success; + } break; case 0x2B: @@ -25220,6 +27636,48 @@ Long dis_ESC_0F38__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */ + /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackusdw", + math_VPACKUSDW_YMM ); + goto decode_success; + } + break; + + case 0x2C: + /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps", + /*!isYMM*/False, Ity_I32 ); + goto decode_success; + } + /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps", + /*isYMM*/True, Ity_I32 ); + goto decode_success; + } + break; + + case 0x2D: + /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd", + /*!isYMM*/False, Ity_I64 ); + goto decode_success; + } + /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd", + /*isYMM*/True, Ity_I64 ); + goto decode_success; + } break; case 0x30: @@ -25230,6 +27688,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXBW xmm2/m128, ymm1 */ + /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x31: @@ -25240,6 +27704,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXBD xmm2/m64, ymm1 */ + /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x32: @@ -25249,6 +27719,12 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVZXBQ xmm2/m32, ymm1 */ + /* VPMOVZXBQ = 
VEX.256.66.0F38.WIG 32 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVZXBQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x33: @@ -25259,6 +27735,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXWD xmm2/m128, ymm1 */ + /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x34: @@ -25267,6 +27749,11 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVZXWQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x35: @@ -25276,6 +27763,21 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } + break; + + case 0x36: + /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD ); + goto decode_success; + } break; case 0x37: @@ -25286,6 +27788,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 ); goto decode_success; } + /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */ + /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 ); + goto decode_success; + } break; case 0x38: @@ -25296,6 +27805,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 ); goto decode_success; } + /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */ + /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 ); + goto decode_success; + } break; case 0x39: @@ -25306,6 +27822,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 ); goto decode_success; } + /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */ + /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 ); + goto decode_success; + } break; case 0x3A: @@ -25316,6 +27839,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 ); goto decode_success; } + /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */ + /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 ); + goto decode_success; + } break; case 0x3B: @@ -25326,6 +27856,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 ); goto decode_success; } + /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */ + /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */ + if (have66noF2noF3(pfx) && 
1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 ); + goto decode_success; + } break; case 0x3C: @@ -25336,6 +27873,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 ); goto decode_success; } + /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */ + /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 ); + goto decode_success; + } break; case 0x3D: @@ -25346,6 +27890,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 ); goto decode_success; } + /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */ + /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 ); + goto decode_success; + } break; case 0x3E: @@ -25356,6 +27907,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 ); goto decode_success; } + /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */ + /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 ); + goto decode_success; + } break; case 0x3F: @@ -25366,6 +27924,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 ); goto decode_success; } + /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */ + /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 ); + goto decode_success; + } break; case 0x40: @@ -25376,6 +27941,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 ); goto decode_success; } + /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */ + /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 ); + goto decode_success; + } break; case 0x41: @@ -25386,33 +27958,1017 @@ Long dis_ESC_0F38__VEX ( } break; - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */ - /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */ - /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */ - /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */ - /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_AESx( vbi, pfx, delta, True/*!isAvx*/, opc ); - if (opc != 0xDB) *uses_vvvv = True; + case 0x45: + /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */ + /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd", + Iop_Shr32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */ + /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */ + if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + delta 
= dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq", + Iop_Shr64, 1==getVexL(pfx) ); + *uses_vvvv = True; goto decode_success; } break; - default: + case 0x46: + /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */ + /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd", + Iop_Sar32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } break; - } + case 0x47: + /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */ + /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd", + Iop_Shl32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */ + /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */ + if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq", + Iop_Shl64, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + break; - //decode_failure: - return deltaIN; + case 0x58: + /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t32, getXMMRegLane32(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t32, loadLE(Ity_I32, mkexpr(addr))); + } + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t32, getXMMRegLane32(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t32, loadLE(Ity_I32, mkexpr(addr))); + } + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; - decode_success: - return delta; + case 0x59: + /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t64, getXMMRegLane64(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastq %s,%s\n", dis_buf, 
nameXMMReg(rG)); + assign(t64, loadLE(Ity_I64, mkexpr(addr))); + } + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t64, getXMMRegLane64(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t64, loadLE(Ity_I64, mkexpr(addr))); + } + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x5A: + /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG)); + IRTemp t128 = newTemp(Ity_V128); + assign(t128, loadLE(Ity_V128, mkexpr(addr))); + putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) ); + goto decode_success; + } + break; + + case 0x78: + /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t8, loadLE(Ity_I8, mkexpr(addr))); + } + IRTemp t16 = newTemp(Ity_I16); + assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8))); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t8, loadLE(Ity_I8, mkexpr(addr))); + } + IRTemp t16 = newTemp(Ity_I16); + assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8))); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + 
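+         /* The broadcast is built by repeated pairwise concatenation of the
+            byte with itself: 8 -> 16 -> 32 -> 64 bits, and then four copies of
+            the resulting 64-bit value are glued into a V256.  The word, dword
+            and qword broadcasts above use the same widening chain, just
+            starting further along. */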
putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x79: + /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t16 = newTemp(Ity_I16); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t16, loadLE(Ity_I16, mkexpr(addr))); + } + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t16 = newTemp(Ity_I16); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t16, loadLE(Ity_I16, mkexpr(addr))); + } + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x8C: + /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd", + /*!isYMM*/False, Ity_I32 ); + goto decode_success; + } + /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd", + /*isYMM*/True, Ity_I32 ); + goto decode_success; + } + /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq", + /*!isYMM*/False, Ity_I64 ); + goto decode_success; + } + /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq", + /*isYMM*/True, Ity_I64 ); + goto decode_success; + } + break; + + case 0x90: + /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd", + /*!isYMM*/False, /*!isVM64x*/False, 
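+                              /* dis_VGATHER presumably hands back its delta
+                                 argument unchanged when it declines the
+                                 encoding, hence the delta != delta0 test below
+                                 instead of an unconditional decode_success. */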
Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd", + /*isYMM*/True, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq", + /*isYMM*/True, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x91: + /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd", + /*!isYMM*/False, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd", + /*isYMM*/True, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq", + /*!isYMM*/False, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq", + /*isYMM*/True, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x92: + /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps", + /*isYMM*/True, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 
1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd", + /*isYMM*/True, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x93: + /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps", + /*!isYMM*/False, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps", + /*isYMM*/True, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd", + /*!isYMM*/False, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd", + /*isYMM*/True, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x96 ... 0x9F: + case 0xA6 ... 0xAF: + case 0xB6 ... 
0xBF: + /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */ + /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */ + /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */ + /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */ + /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */ + /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */ + /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */ + /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */ + /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */ + /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */ + /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */ + /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */ + /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */ + /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */ + /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */ + /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */ + /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */ + /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */ + /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */ + /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */ + /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */ + /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */ + /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */ + /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */ + /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */ + /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */ + /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */ + /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */ + /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */ + /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */ + /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */ + /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */ + /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */ + /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */ + /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */ + /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */ + /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */ + /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */ + /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */ + /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */ + /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */ + /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */ + /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */ + /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */ + /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */ + /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */ + /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */ + /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA 
/r */ + /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */ + /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */ + /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */ + /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */ + /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */ + /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */ + /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */ + /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */ + /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */ + /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */ + /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */ + /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */ + /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */ + /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */ + /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */ + /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */ + /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */ + /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */ + /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */ + /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */ + /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */ + /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */ + /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */ + /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */ + /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */ + /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */ + /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */ + /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */ + /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */ + /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */ + /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */ + /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */ + /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */ + /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */ + /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */ + /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */ + /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */ + /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */ + /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */ + /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */ + /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */ + /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */ + /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */ + /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */ + /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */ + /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */ + /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */ + /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */ + if 
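+         /* All of the encodings listed above funnel into dis_FMA, which is
+            assumed to recover the operand ordering (132/213/231), the
+            add/sub/addsub/negate variant and the PS/PD/SS/SD width from opc
+            and the VEX.W bit. */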
(have66noF2noF3(pfx)) { + delta = dis_FMA( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */ + /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */ + /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */ + /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */ + /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_AESx( vbi, pfx, delta, True/*!isAvx*/, opc ); + if (opc != 0xDB) *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF2: + /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */ + /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src1, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src2, getIRegE(size,pfx,rm) ); + DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src2, loadLE(ty, mkexpr(addr)) ); + DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( dst, binop( mkSizedOp(ty,Iop_And8), + unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ), + mkexpr(src2) ) ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_ANDN64 + : AMD64G_CC_OP_ANDN32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF3: + /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */ + /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_And8), + binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0), + mkexpr(src)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSI64 + : AMD64G_CC_OP_BLSI32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */ + /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) { + Int size = getRexW(pfx) ? 
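+         /* For reference, the BMI1 bit tricks used by this group:
+              BLSI   dst = src & -src        (isolate lowest set bit)
+              BLSMSK dst = src ^ (src - 1)   (mask up to and including it)
+              BLSR   dst = src & (src - 1)   (clear it)
+            e.g. src = 0b01100 gives 0b00100, 0b00111 and 0b01000
+            respectively. */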
8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_Xor8), + binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src), + mkU(ty, 1)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSMSK64 + : AMD64G_CC_OP_BLSMSK32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */ + /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_And8), + binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src), + mkU(ty, 1)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSR64 + : AMD64G_CC_OP_BLSR32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF5: + /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */ + /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
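+         /* BZHI zeroes all bits of src1 from bit position 'start' upwards,
+            where 'start' is the low byte of the second operand; e.g. with
+            src1 = 0xFF and start = 4 the result is 0x0F.  CF is set when
+            start is >= the operand width, which is what 'cond' below
+            captures. */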
8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp start = newTemp(Ity_I8); + IRTemp cond = newTemp(Ity_I8); + UChar rm = getUChar(delta); + + assign( src2, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src1, getIRegE(size,pfx,rm) ); + DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src1, loadLE(ty, mkexpr(addr)) ); + DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( start, narrowTo( Ity_I8, mkexpr(src2) ) ); + assign( cond, binop(Iop_CmpLT32U, + unop(Iop_8Uto32, mkexpr(start)), + mkU32(8*size)) ); + /* if (start < opsize) { + if (start == 0) + dst = 0; + else + dst = (src1 << (opsize-start)) u>> (opsize-start); + } else { + dst = src1; + } */ + assign( dst, + IRExpr_ITE( + mkexpr(cond), + IRExpr_ITE( + binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)), + mkU(ty, 0), + binop( + mkSizedOp(ty,Iop_Shr8), + binop( + mkSizedOp(ty,Iop_Shl8), + mkexpr(src1), + binop(Iop_Sub8, mkU8(8*size), mkexpr(start)) + ), + binop(Iop_Sub8, mkU8(8*size), mkexpr(start)) + ) + ), + mkexpr(src1) + ) + ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSR64 + : AMD64G_CC_OP_BLSR32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) ); + *uses_vvvv = True; + goto decode_success; + } + /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */ + /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp mask = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( mask, getIRegE(size,pfx,rm) ); + DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( mask, loadLE(ty, mkexpr(addr)) ); + DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)), + widenUto64(mkexpr(mask)) ); + putIRegG( size, pfx, rm, + narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/, + "amd64g_calculate_pdep", + &amd64g_calculate_pdep, args)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */ + /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */ + if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
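+         /* PEXT gathers the src bits selected by mask into the low-order bits
+            of the result; PDEP (decoded just above) is the inverse, scattering
+            the low-order bits of src into the positions of the set mask bits.
+            E.g. pdep(0b101, 0b11010) = 0b10010 and
+            pext(0b10010, 0b11010) = 0b101. */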
8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp mask = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( mask, getIRegE(size,pfx,rm) ); + DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( mask, loadLE(ty, mkexpr(addr)) ); + DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + /* First mask off bits not set in mask, they are ignored + and it should be fine if they contain undefined values. */ + IRExpr* masked = binop(mkSizedOp(ty,Iop_And8), + mkexpr(src), mkexpr(mask)); + IRExpr** args = mkIRExprVec_2( widenUto64(masked), + widenUto64(mkexpr(mask)) ); + putIRegG( size, pfx, rm, + narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/, + "amd64g_calculate_pext", + &amd64g_calculate_pext, args)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + break; + + case 0xF6: + /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */ + /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp res = newTemp(size == 8 ? Ity_I128 : Ity_I64); + UChar rm = getUChar(delta); + + assign( src1, getIRegRDX(size) ); + if (epartIsReg(rm)) { + assign( src2, getIRegE(size,pfx,rm) ); + DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src2, loadLE(ty, mkexpr(addr)) ); + DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32, + mkexpr(src1), mkexpr(src2)) ); + putIRegV( size, pfx, + unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) ); + putIRegG( size, pfx, rm, + unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32, + mkexpr(res)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + break; + + case 0xF7: + /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */ + /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */ + if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 ); + goto decode_success; + } + /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */ + /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 ); + goto decode_success; + } + /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */ + /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 ); + goto decode_success; + } + /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */ + /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
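+         /* BEXTR extracts a bit field: start = src2[7:0], len = src2[15:8],
+            dst = (src1 >> start) & ((1 << len) - 1), with out-of-range start
+            or len handled as in the pseudo-code further down.  E.g. src1 =
+            0xABCD, start = 4, len = 8 gives 0xBC. */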
8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp stle = newTemp(Ity_I16); + IRTemp start = newTemp(Ity_I8); + IRTemp len = newTemp(Ity_I8); + UChar rm = getUChar(delta); + + assign( src2, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src1, getIRegE(size,pfx,rm) ); + DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src1, loadLE(ty, mkexpr(addr)) ); + DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) ); + assign( start, unop( Iop_16to8, mkexpr(stle) ) ); + assign( len, unop( Iop_16HIto8, mkexpr(stle) ) ); + /* if (start+len < opsize) { + if (len != 0) + dst = (src1 << (opsize-start-len)) u>> (opsize-len); + else + dst = 0; + } else { + if (start < opsize) + dst = src1 u>> start; + else + dst = 0; + } */ + assign( dst, + IRExpr_ITE( + binop(Iop_CmpLT32U, + binop(Iop_Add32, + unop(Iop_8Uto32, mkexpr(start)), + unop(Iop_8Uto32, mkexpr(len))), + mkU32(8*size)), + IRExpr_ITE( + binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)), + mkU(ty, 0), + binop(mkSizedOp(ty,Iop_Shr8), + binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1), + binop(Iop_Sub8, + binop(Iop_Sub8, mkU8(8*size), + mkexpr(start)), + mkexpr(len))), + binop(Iop_Sub8, mkU8(8*size), + mkexpr(len))) + ), + IRExpr_ITE( + binop(Iop_CmpLT32U, + unop(Iop_8Uto32, mkexpr(start)), + mkU32(8*size)), + binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1), + mkexpr(start)), + mkU(ty, 0) + ) + ) + ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_ANDN64 + : AMD64G_CC_OP_ANDN32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + return deltaIN; + + decode_success: + return delta; } @@ -25460,7 +29016,132 @@ Long dis_ESC_0F3A__VEX ( delta++; *uses_vvvv = False; - switch (opc) { + switch (opc) { + + case 0x00: + case 0x01: + /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */ + /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1==getRexW(pfx)/*W1*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp sV = newTemp(Ity_V256); + const HChar *name = opc == 0 ? 
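+         /* Destination 64-bit lane i is source lane (imm8 >> (2*i)) & 3, as
+            selected by the Qop below; e.g. imm8 == 0x1B reverses the four
+            lanes. */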
"vpermq" : "vpermpd"; + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("%s $%u,%s,%s\n", + name, imm8, nameYMMReg(rE), nameYMMReg(rG)); + assign(sV, getYMMReg(rE)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("%s $%u,%s,%s\n", + name, imm8, dis_buf, nameYMMReg(rG)); + assign(sV, loadLE(Ity_V256, mkexpr(addr))); + } + delta++; + IRTemp s[4]; + s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]); + IRTemp dV = newTemp(Ity_V256); + assign(dV, IRExpr_Qop(Iop_64x4toV256, + mkexpr(s[(imm8 >> 6) & 3]), + mkexpr(s[(imm8 >> 4) & 3]), + mkexpr(s[(imm8 >> 2) & 3]), + mkexpr(s[(imm8 >> 0) & 3]))); + putYMMReg(rG, mkexpr(dV)); + goto decode_success; + } + break; + + case 0x02: + /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + UInt i; + IRTemp s[4], d[4]; + assign(sV, getXMMReg(rV)); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("vpblendd $%u,%s,%s,%s\n", + imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + assign(dV, getXMMReg(rE)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("vpblendd $%u,%s,%s,%s\n", + imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + assign(dV, loadLE(Ity_V128, mkexpr(addr))); + } + delta++; + for (i = 0; i < 4; i++) { + s[i] = IRTemp_INVALID; + d[i] = IRTemp_INVALID; + } + breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] ); + breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] ); + for (i = 0; i < 4; i++) + putYMMRegLane32(rG, i, mkexpr((imm8 & (1<> 3) ), + mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x44: @@ -26524,6 +30388,52 @@ Long dis_ESC_0F3A__VEX ( } break; + case 0x46: + /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 46 /r ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp s00 = newTemp(Ity_V128); + IRTemp s01 = newTemp(Ity_V128); + IRTemp s10 = newTemp(Ity_V128); + IRTemp s11 = newTemp(Ity_V128); + assign(s00, getYMMRegLane128(rV, 0)); + assign(s01, getYMMRegLane128(rV, 1)); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("vperm2i128 $%u,%s,%s,%s\n", + imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG)); + assign(s10, getYMMRegLane128(rE, 0)); + assign(s11, getYMMRegLane128(rE, 1)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("vperm2i128 $%u,%s,%s,%s\n", + imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG)); + assign(s10, loadLE(Ity_V128, binop(Iop_Add64, + mkexpr(addr), mkU64(0)))); + assign(s11, loadLE(Ity_V128, binop(Iop_Add64, + mkexpr(addr), mkU64(16)))); + } + delta++; +# define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \ + : ((_nn)==2) ? 
s10 : s11) + putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3))); + putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3))); +# undef SEL + if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0)); + if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0)); + *uses_vvvv = True; + goto decode_success; + } + break; + case 0x4A: /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4 ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */ @@ -26576,6 +30486,15 @@ Long dis_ESC_0F3A__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4 + ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */ + /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VBLENDV_256 ( vbi, pfx, delta, + "vpblendvb", 1, Iop_SarN8x16 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x60: @@ -26605,6 +30524,44 @@ Long dis_ESC_0F3A__VEX ( } break; + case 0xF0: + /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */ + /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + UChar rm = getUChar(delta); + UChar imm8; + + if (epartIsReg(rm)) { + imm8 = getUChar(delta+1); + assign( src, getIRegE(size,pfx,rm) ); + DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm), + nameIRegG(size,pfx,rm)); + delta += 2; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + imm8 = getUChar(delta+alen); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm)); + delta += alen + 1; + } + imm8 &= 8*size-1; + + /* dst = (src >>u imm8) | (src << (size-imm8)) */ + putIRegG( size, pfx, rm, + imm8 == 0 ? mkexpr(src) + : binop( mkSizedOp(ty,Iop_Or8), + binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src), + mkU8(imm8) ), + binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src), + mkU8(8*size-imm8) ) ) ); + /* Flags aren't modified. */ + goto decode_success; + } + break; + default: break; diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index e7b878eb2f..9f2aa64fa2 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -43,6 +43,8 @@ #include "host_generic_regs.h" #include "host_generic_simd64.h" #include "host_generic_simd128.h" +#include "host_generic_simd256.h" +#include "host_generic_maddf.h" #include "host_amd64_defs.h" @@ -2531,6 +2533,73 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } + if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) { + /* Sigh ... very rough code. Could do much better. */ + /* Get the 128-bit literal 00---0 10---0 into a register + and xor it with the value to be negated. 
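+         The two pushes leave the 16-byte value { 0x0000000080000000, 0 } at
+         (%rsp); XORing that into the operand flips just bit 31, the F32 sign
+         bit, and leaves the rest of the register alone.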
*/ + HReg r1 = newVRegI(env); + HReg dst = newVRegV(env); + HReg tmp = newVRegV(env); + HReg src = iselFltExpr(env, e->Iex.Unop.arg); + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + addInstr(env, mk_vMOVsd_RR(src,tmp)); + addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); + addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 )); + addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1))); + addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0)); + addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst)); + add_to_rsp(env, 16); + return dst; + } + + if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) { + IRQop *qop = e->Iex.Qop.details; + HReg dst = newVRegV(env); + HReg argX = iselFltExpr(env, qop->arg2); + HReg argY = iselFltExpr(env, qop->arg3); + HReg argZ = iselFltExpr(env, qop->arg4); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + /* subq $16, %rsp -- make a space*/ + sub_from_rsp(env, 16); + /* Prepare 4 arg regs: + leaq 0(%rsp), %rdi + leaq 4(%rsp), %rsi + leaq 8(%rsp), %rdx + leaq 12(%rsp), %rcx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), + hregAMD64_RDX())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()), + hregAMD64_RCX())); + /* Store the three args, at (%rsi), (%rdx) and (%rcx): + movss %argX, 0(%rsi) + movss %argY, 0(%rdx) + movss %argZ, 0(%rcx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ, + AMD64AMode_IR(0, hregAMD64_RCX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, + (ULong)(HWord)h_generic_calc_MAddF32, + 4, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. 
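+         (Here that just means 0(%rsp), the slot whose address was passed in
+         %rdi.)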
*/ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst, + AMD64AMode_IR(0, hregAMD64_RSP()))); + /* and finally, clear the space */ + add_to_rsp(env, 16); + return dst; + } + ppIRExpr(e); vpanic("iselFltExpr_wrk"); } @@ -2662,6 +2731,54 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ) } } + if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) { + IRQop *qop = e->Iex.Qop.details; + HReg dst = newVRegV(env); + HReg argX = iselDblExpr(env, qop->arg2); + HReg argY = iselDblExpr(env, qop->arg3); + HReg argZ = iselDblExpr(env, qop->arg4); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + /* subq $32, %rsp -- make a space*/ + sub_from_rsp(env, 32); + /* Prepare 4 arg regs: + leaq 0(%rsp), %rdi + leaq 8(%rsp), %rsi + leaq 16(%rsp), %rdx + leaq 24(%rsp), %rcx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()), + hregAMD64_RDX())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()), + hregAMD64_RCX())); + /* Store the three args, at (%rsi), (%rdx) and (%rcx): + movsd %argX, 0(%rsi) + movsd %argY, 0(%rdx) + movsd %argZ, 0(%rcx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ, + AMD64AMode_IR(0, hregAMD64_RCX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, + (ULong)(HWord)h_generic_calc_MAddF64, + 4, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. 
*/ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst, + AMD64AMode_IR(0, hregAMD64_RSP()))); + /* and finally, clear the space */ + add_to_rsp(env, 32); + return dst; + } + if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg = iselDblExpr(env, e->Iex.Binop.arg2); @@ -3478,6 +3595,7 @@ static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, ISelEnv* env, IRExpr* e ) { + HWord fn = 0; /* address of helper fn, if required */ vassert(e); IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_V256); @@ -3599,6 +3717,8 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, } case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector; + case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector; + case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector; do_CmpNEZ_vector: { HReg argHi, argLo; @@ -3673,6 +3793,37 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, case Iop_AndV256: op = Asse_AND; goto do_SseReRg; case Iop_OrV256: op = Asse_OR; goto do_SseReRg; case Iop_XorV256: op = Asse_XOR; goto do_SseReRg; + case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg; + case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg; + case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg; + case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg; + case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg; + case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg; + case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg; + case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg; + case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg; + case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg; + case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg; + case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg; + case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg; + case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg; + case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg; + case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg; + case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg; + case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg; + case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg; + case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg; + case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg; + case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg; + case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg; + case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg; + case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg; + case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg; + case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg; + case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg; + case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg; + case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg; + case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg; do_SseReRg: { HReg argLhi, argLlo, argRhi, argRlo; @@ -3689,12 +3840,198 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift; + case Iop_ShlN32x8: op = Asse_SHL32; goto do_SseShift; + case Iop_ShlN64x4: op = Asse_SHL64; goto do_SseShift; + case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift; + case Iop_SarN32x8: op = Asse_SAR32; goto do_SseShift; + case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift; + case Iop_ShrN32x8: op = Asse_SHR32; goto do_SseShift; + case 
Iop_ShrN64x4: op = Asse_SHR64; goto do_SseShift; + do_SseShift: { + HReg gregHi, gregLo; + iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1); + AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2); + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + HReg ereg = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); + addInstr(env, AMD64Instr_Push(rmi)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0)); + addInstr(env, mk_vMOVsd_RR(gregHi, dstHi)); + addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi)); + addInstr(env, mk_vMOVsd_RR(gregLo, dstLo)); + addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo)); + add_to_rsp(env, 16); + *rHi = dstHi; + *rLo = dstLo; + return; + } + case Iop_V128HLtoV256: { *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; } + case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4; + goto do_SseAssistedBinary; + case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4; + goto do_SseAssistedBinary; + case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4; + goto do_SseAssistedBinary; + case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4; + goto do_SseAssistedBinary; + case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4; + goto do_SseAssistedBinary; + case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8; + goto do_SseAssistedBinary; + case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8; + goto do_SseAssistedBinary; + case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16; + goto do_SseAssistedBinary; + case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16; + goto do_SseAssistedBinary; + case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2; + goto do_SseAssistedBinary; + case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2; + goto do_SseAssistedBinary; + do_SseAssistedBinary: { + /* RRRufff! RRRufff code is what we're generating here. Oh + well. 
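+         A V256 guest value is carried as two V128 halves, and these ops have
+         no SSE2 equivalent, so the existing 128-bit helper is called twice
+         through a 16-aligned scratch area on the stack: once for the two high
+         halves (result at 0(%r_argp)) and once for the two low halves (result
+         at 48(%r_argp)).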
*/ + vassert(fn != 0); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); + iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); + HReg argp = newVRegI(env); + /* subq $160, %rsp -- make a space*/ + sub_from_rsp(env, 160); + /* leaq 48(%rsp), %r_argp -- point into it */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), + argp)); + /* andq $-16, %r_argp -- 16-align the pointer */ + addInstr(env, AMD64Instr_Alu64R(Aalu_AND, + AMD64RMI_Imm( ~(UInt)15 ), + argp)); + /* Prepare 3 arg regs: + leaq 0(%r_argp), %rdi + leaq 16(%r_argp), %rsi + leaq 32(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), + hregAMD64_RDX())); + /* Store the two high args, at (%rsi) and (%rdx): + movupd %argLhi, 0(%rsi) + movupd %argRhi, 0(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, + AMD64AMode_IR(0, hregAMD64_RDX()))); + /* Store the two low args, at 48(%rsi) and 48(%rdx): + movupd %argLlo, 48(%rsi) + movupd %argRlo, 48(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, + AMD64AMode_IR(48, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, + AMD64AMode_IR(48, hregAMD64_RDX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* Prepare 3 arg regs: + leaq 48(%r_argp), %rdi + leaq 64(%r_argp), %rsi + leaq 80(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp), + hregAMD64_RDX())); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. */ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, + AMD64AMode_IR(0, argp))); + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, + AMD64AMode_IR(48, argp))); + /* and finally, clear the space */ + add_to_rsp(env, 160); + *rHi = dstHi; + *rLo = dstLo; + return; + } + + case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8; + goto do_SseAssistedBinary256; + do_SseAssistedBinary256: { + /* RRRufff! RRRufff code is what we're generating here. Oh + well. 
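+         Unlike the case above, Perm32x8 needs to see whole 256-bit operands,
+         so both halves of each argument are stored contiguously (32 bytes
+         apiece) and a single helper call produces the full result, read back
+         from 0(%r_argp) and 16(%r_argp).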
*/ + vassert(fn != 0); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); + iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); + HReg argp = newVRegI(env); + /* subq $160, %rsp -- make a space*/ + sub_from_rsp(env, 160); + /* leaq 48(%rsp), %r_argp -- point into it */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), + argp)); + /* andq $-16, %r_argp -- 16-align the pointer */ + addInstr(env, AMD64Instr_Alu64R(Aalu_AND, + AMD64RMI_Imm( ~(UInt)15 ), + argp)); + /* Prepare 3 arg regs: + leaq 0(%r_argp), %rdi + leaq 32(%r_argp), %rsi + leaq 64(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), + hregAMD64_RDX())); + /* Store the two args, at (%rsi) and (%rdx): + movupd %argLlo, 0(%rsi) + movupd %argLhi, 16(%rsi) + movupd %argRlo, 0(%rdx) + movupd %argRhi, 16(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, + AMD64AMode_IR(16, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, + AMD64AMode_IR(16, hregAMD64_RDX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. */ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, + AMD64AMode_IR(0, argp))); + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, + AMD64AMode_IR(16, argp))); + /* and finally, clear the space */ + add_to_rsp(env, 160); + *rHi = dstHi; + *rLo = dstLo; + return; + } + default: break; } /* switch (e->Iex.Binop.op) */ @@ -3725,6 +4062,22 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + if (e->tag == Iex_ITE) { + HReg r1Hi, r1Lo, r0Hi, r0Lo; + iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue); + iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi)); + addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo)); + AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond); + addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi)); + addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + //avx_fail: vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); @@ -4303,7 +4656,9 @@ HInstrArray* iselSB_AMD64 ( IRSB* bb, | VEX_HWCAPS_AMD64_CX16 | VEX_HWCAPS_AMD64_LZCNT | VEX_HWCAPS_AMD64_AVX - | VEX_HWCAPS_AMD64_RDTSCP))); + | VEX_HWCAPS_AMD64_RDTSCP + | VEX_HWCAPS_AMD64_BMI + | VEX_HWCAPS_AMD64_AVX2))); /* Make up an initial environment to use. 
*/ env = LibVEX_Alloc(sizeof(ISelEnv)); diff --git a/VEX/priv/host_generic_maddf.c b/VEX/priv/host_generic_maddf.c new file mode 100644 index 0000000000..d4e9fb7d60 --- /dev/null +++ b/VEX/priv/host_generic_maddf.c @@ -0,0 +1,320 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_maddf.c ---*/ +/*---------------------------------------------------------------*/ + +/* + Compute x * y + z as ternary operation. + Copyright (C) 2010-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek , 2010. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . +*/ + +/* Generic helper functions for doing FMA, i.e. compute x * y + z + as ternary operation. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#include "libvex_basictypes.h" +#include "host_generic_maddf.h" +#include "main_util.h" + +/* This implementation relies on Double being more than twice as + precise as Float and uses rounding to odd in order to avoid problems + with double rounding. + See a paper by Boldo and Melquiond: + http://www.lri.fr/~melquion/doc/08-tc.pdf */ + +#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X)) + +#if defined(__x86_64__) && defined(__SSE2_MATH__) +# define ENV_TYPE unsigned int +/* Save current rounding mode into ENV, hold exceptions, set rounding + mode to rounding toward zero. */ +# define ROUNDTOZERO(env) \ + do { \ + unsigned int mxcsr; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + (env) = mxcsr; \ + mxcsr = (mxcsr | 0x7f80) & ~0x3f; \ + __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\ + } while (0) +/* Restore exceptions from ENV, return if inexact exception has been raised + since ROUNDTOZERO. */ +# define RESET_TESTINEXACT(env) \ + ({ \ + unsigned int mxcsr, ret; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + ret = (mxcsr >> 5) & 1; \ + mxcsr = (mxcsr & 0x3d) | (env); \ + __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\ + ret; \ + }) +/* Return if inexact exception has been raised since ROUNDTOZERO. */ +# define TESTINEXACT() \ + ({ \ + unsigned int mxcsr; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + (mxcsr >> 5) & 1; \ + }) +#endif + +#define DBL_MANT_DIG 53 +#define IEEE754_DOUBLE_BIAS 0x3ff + +union vg_ieee754_double { + Double d; + + /* This is the IEEE 754 double-precision format. */ + struct { +#ifdef VKI_BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:11; + unsigned int mantissa0:20; + unsigned int mantissa1:32; +#else + unsigned int mantissa1:32; + unsigned int mantissa0:20; + unsigned int exponent:11; + unsigned int negative:1; +#endif + } ieee; +}; + +void VEX_REGPARM(3) + h_generic_calc_MAddF32 ( /*OUT*/Float* res, + Float* argX, Float* argY, Float* argZ ) +{ +#ifndef ENV_TYPE + /* Lame fallback implementation. */ + *res = *argX * *argY + *argZ; +#else + ENV_TYPE env; + /* Multiplication is always exact. 
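+      Each Float operand carries a 24-bit significand, so the product
+      needs at most 48 bits and fits exactly in a Double's 53-bit
+      significand.  Only the addition below can be inexact: it is done
+      with rounding toward zero, and if it raises the inexact flag while
+      the result's last significand bit is even, that bit is set to 1
+      ("round to odd"), which keeps the final conversion back to Float
+      free of double rounding.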
*/ + Double temp = (Double) *argX * (Double) *argY; + union vg_ieee754_double u; + + ROUNDTOZERO (env); + + /* Perform addition with round to odd. */ + u.d = temp + (Double) *argZ; + /* Ensure the addition is not scheduled after fetestexcept call. */ + FORCE_EVAL (u.d); + + /* Reset rounding mode and test for inexact simultaneously. */ + int j = RESET_TESTINEXACT (env); + + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + + /* And finally truncation with round to nearest. */ + *res = (Float) u.d; +#endif +} + + +void VEX_REGPARM(3) + h_generic_calc_MAddF64 ( /*OUT*/Double* res, + Double* argX, Double* argY, Double* argZ ) +{ +#ifndef ENV_TYPE + /* Lame fallback implementation. */ + *res = *argX * *argY + *argZ; +#else + Double x = *argX, y = *argY, z = *argZ; + union vg_ieee754_double u, v, w; + int adjust = 0; + u.d = x; + v.d = y; + w.d = z; + if (UNLIKELY (u.ieee.exponent + v.ieee.exponent + >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) + || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (u.ieee.exponent + v.ieee.exponent + <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) { + /* If z is Inf, but x and y are finite, the result should be + z rather than NaN. */ + if (w.ieee.exponent == 0x7ff + && u.ieee.exponent != 0x7ff + && v.ieee.exponent != 0x7ff) { + *res = (z + x) + y; + return; + } + /* If x or y or z is Inf/NaN, or if fma will certainly overflow, + or if x * y is less than half of DBL_DENORM_MIN, + compute as x * y + z. */ + if (u.ieee.exponent == 0x7ff + || v.ieee.exponent == 0x7ff + || w.ieee.exponent == 0x7ff + || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS + || u.ieee.exponent + v.ieee.exponent + < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) { + *res = x * y + z; + return; + } + if (u.ieee.exponent + v.ieee.exponent + >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) { + /* Compute 1p-53 times smaller result and multiply + at the end. */ + if (u.ieee.exponent > v.ieee.exponent) + u.ieee.exponent -= DBL_MANT_DIG; + else + v.ieee.exponent -= DBL_MANT_DIG; + /* If x + y exponent is very large and z exponent is very small, + it doesn't matter if we don't adjust it. */ + if (w.ieee.exponent > DBL_MANT_DIG) + w.ieee.exponent -= DBL_MANT_DIG; + adjust = 1; + } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + /* Similarly. + If z exponent is very large and x and y exponents are + very small, it doesn't matter if we don't adjust it. 
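+              As in the branch above, the inputs are scaled down by
+              2^DBL_MANT_DIG so that the intermediate sum stays in
+              range, and 'adjust' records that the final result has to
+              be multiplied back up by 0x1p53.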
*/ + if (u.ieee.exponent > v.ieee.exponent) { + if (u.ieee.exponent > DBL_MANT_DIG) + u.ieee.exponent -= DBL_MANT_DIG; + } else if (v.ieee.exponent > DBL_MANT_DIG) + v.ieee.exponent -= DBL_MANT_DIG; + w.ieee.exponent -= DBL_MANT_DIG; + adjust = 1; + } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + u.ieee.exponent -= DBL_MANT_DIG; + if (v.ieee.exponent) + v.ieee.exponent += DBL_MANT_DIG; + else + v.d *= 0x1p53; + } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + v.ieee.exponent -= DBL_MANT_DIG; + if (u.ieee.exponent) + u.ieee.exponent += DBL_MANT_DIG; + else + u.d *= 0x1p53; + } else /* if (u.ieee.exponent + v.ieee.exponent + <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ { + if (u.ieee.exponent > v.ieee.exponent) + u.ieee.exponent += 2 * DBL_MANT_DIG; + else + v.ieee.exponent += 2 * DBL_MANT_DIG; + if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) { + if (w.ieee.exponent) + w.ieee.exponent += 2 * DBL_MANT_DIG; + else + w.d *= 0x1p106; + adjust = -1; + } + /* Otherwise x * y should just affect inexact + and nothing else. */ + } + x = u.d; + y = v.d; + z = w.d; + } + /* Multiplication m1 + m2 = x * y using Dekker's algorithm. */ +# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1) + Double x1 = x * C; + Double y1 = y * C; + Double m1 = x * y; + x1 = (x - x1) + x1; + y1 = (y - y1) + y1; + Double x2 = x - x1; + Double y2 = y - y1; + Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2; +# undef C + + /* Addition a1 + a2 = z + m1 using Knuth's algorithm. */ + Double a1 = z + m1; + Double t1 = a1 - z; + Double t2 = a1 - t1; + t1 = m1 - t1; + t2 = z - t2; + Double a2 = t1 + t2; + + ENV_TYPE env; + ROUNDTOZERO (env); + + /* Perform m2 + a2 addition with round to odd. */ + u.d = a2 + m2; + + if (UNLIKELY (adjust < 0)) { + if ((u.ieee.mantissa1 & 1) == 0) + u.ieee.mantissa1 |= TESTINEXACT (); + v.d = a1 + u.d; + /* Ensure the addition is not scheduled after fetestexcept call. */ + FORCE_EVAL (v.d); + } + + /* Reset rounding mode and test for inexact simultaneously. */ + int j = RESET_TESTINEXACT (env) != 0; + + if (LIKELY (adjust == 0)) { + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + /* Result is a1 + u.d. */ + *res = a1 + u.d; + } else if (LIKELY (adjust > 0)) { + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + /* Result is a1 + u.d, scaled up. */ + *res = (a1 + u.d) * 0x1p53; + } else { + /* If a1 + u.d is exact, the only rounding happens during + scaling down. */ + if (j == 0) { + *res = v.d * 0x1p-106; + return; + } + /* If result rounded to zero is not subnormal, no double + rounding will occur. */ + if (v.ieee.exponent > 106) { + *res = (a1 + u.d) * 0x1p-106; + return; + } + /* If v.d * 0x1p-106 with round to zero is a subnormal above + or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa + down just by 1 bit, which means v.ieee.mantissa1 |= j would + change the round bit, not sticky or guard bit. + v.d * 0x1p-106 never normalizes by shifting up, + so round bit plus sticky bit should be already enough + for proper rounding. */ + if (v.ieee.exponent == 106) { + /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding, + v.ieee.mantissa1 & 1 is the round bit and j is our sticky + bit. In round-to-nearest 001 rounds down like 00, + 011 rounds up, even though 01 rounds down (thus we need + to adjust), 101 rounds down like 10 and 111 rounds up + like 11. 
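+              Since the j == 0 case has already returned above, the
+              sticky bit is known to be set here, so the only pattern
+              that needs fixing is '01'; the nudge by one subnormal ulp
+              (0x1p-1074) below supplies the missing round-up.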
*/ + if ((v.ieee.mantissa1 & 3) == 1) { + v.d *= 0x1p-106; + if (v.ieee.negative) + *res = v.d - 0x1p-1074; + else + *res = v.d + 0x1p-1074; + } else + *res = v.d * 0x1p-106; + return; + } + v.ieee.mantissa1 |= j; + *res = v.d * 0x1p-106; + return; + } +#endif +} + +/*---------------------------------------------------------------*/ +/*--- end host_generic_maddf.c --*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_maddf.h b/VEX/priv/host_generic_maddf.h new file mode 100644 index 0000000000..6757f74544 --- /dev/null +++ b/VEX/priv/host_generic_maddf.h @@ -0,0 +1,48 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_maddf.h ---*/ +/*---------------------------------------------------------------*/ + +/* + Compute x * y + z as ternary operation. + Copyright (C) 2010-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek , 2010. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . +*/ + +/* Generic helper functions for doing FMA, i.e. compute x * y + z + as ternary operation. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#ifndef __VEX_HOST_GENERIC_MADDF_H +#define __VEX_HOST_GENERIC_MADDF_H + +#include "libvex_basictypes.h" + +extern VEX_REGPARM(3) + void h_generic_calc_MAddF32 ( /*OUT*/Float*, Float*, Float*, Float* ); + +extern VEX_REGPARM(3) + void h_generic_calc_MAddF64 ( /*OUT*/Double*, Double*, Double*, + Double* ); + +#endif /* ndef __VEX_HOST_GENERIC_MADDF_H */ + +/*---------------------------------------------------------------*/ +/*--- end host_generic_maddf.h --*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_simd256.c b/VEX/priv/host_generic_simd256.c new file mode 100644 index 0000000000..93990d22d0 --- /dev/null +++ b/VEX/priv/host_generic_simd256.c @@ -0,0 +1,57 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_simd256.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2012 OpenWorks GbR + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Generic helper functions for doing 256-bit SIMD arithmetic in cases + where the instruction selectors cannot generate code in-line. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#include "libvex_basictypes.h" +#include "host_generic_simd256.h" + + +void VEX_REGPARM(3) + h_generic_calc_Perm32x8 ( /*OUT*/V256* res, + V256* argL, V256* argR ) +{ + res->w32[0] = argL->w32[ argR->w32[0] & 7 ]; + res->w32[1] = argL->w32[ argR->w32[1] & 7 ]; + res->w32[2] = argL->w32[ argR->w32[2] & 7 ]; + res->w32[3] = argL->w32[ argR->w32[3] & 7 ]; + res->w32[4] = argL->w32[ argR->w32[4] & 7 ]; + res->w32[5] = argL->w32[ argR->w32[5] & 7 ]; + res->w32[6] = argL->w32[ argR->w32[6] & 7 ]; + res->w32[7] = argL->w32[ argR->w32[7] & 7 ]; +} + + +/*---------------------------------------------------------------*/ +/*--- end host_generic_simd256.c ---*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_simd256.h b/VEX/priv/host_generic_simd256.h new file mode 100644 index 0000000000..1254316f1d --- /dev/null +++ b/VEX/priv/host_generic_simd256.h @@ -0,0 +1,55 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_simd256.h ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2012 OpenWorks GbR + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Generic helper functions for doing 256-bit SIMD arithmetic in cases + where the instruction selectors cannot generate code in-line. + These are purely back-end entities and cannot be seen/referenced + as clean helper functions from IR. + + These will get called from generated code and therefore should be + well behaved -- no floating point or mmx insns, just straight + integer code. + + Each function implements the correspondingly-named IR primop. 
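+
+   A minimal usage sketch (hypothetical standalone caller, not part of
+   VEX; it assumes only the V256 union from libvex_basictypes.h and the
+   declaration below):
+
+      V256 a, b, r;
+      UInt i;
+      for (i = 0; i < 8; i++) { a.w32[i] = 100 + i; b.w32[i] = 7 - i; }
+      h_generic_calc_Perm32x8(&r, &a, &b);
+
+   Afterwards r.w32[] holds {107,106,105,104,103,102,101,100}: result
+   lane i selects lane (argR.w32[i] & 7) of argL.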
+*/ + +#ifndef __VEX_HOST_GENERIC_SIMD256_H +#define __VEX_HOST_GENERIC_SIMD256_H + +#include "libvex_basictypes.h" + +extern VEX_REGPARM(3) + void h_generic_calc_Perm32x8 ( /*OUT*/V256*, V256*, V256* ); + +#endif /* ndef __VEX_HOST_GENERIC_SIMD256_H */ + +/*---------------------------------------------------------------*/ +/*--- end host_generic_simd256.h ---*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 39be267f0f..c3f7bfd593 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -1036,6 +1036,68 @@ void ppIROp ( IROp op ) case Iop_NotV256: vex_printf("NotV256"); return; case Iop_CmpNEZ64x4: vex_printf("CmpNEZ64x4"); return; case Iop_CmpNEZ32x8: vex_printf("CmpNEZ32x8"); return; + case Iop_CmpNEZ16x16: vex_printf("CmpNEZ16x16"); return; + case Iop_CmpNEZ8x32: vex_printf("CmpNEZ8x32"); return; + + case Iop_Add8x32: vex_printf("Add8x32"); return; + case Iop_Add16x16: vex_printf("Add16x16"); return; + case Iop_Add32x8: vex_printf("Add32x8"); return; + case Iop_Add64x4: vex_printf("Add64x4"); return; + case Iop_Sub8x32: vex_printf("Sub8x32"); return; + case Iop_Sub16x16: vex_printf("Sub16x16"); return; + case Iop_Sub32x8: vex_printf("Sub32x8"); return; + case Iop_Sub64x4: vex_printf("Sub64x4"); return; + case Iop_QAdd8Ux32: vex_printf("QAdd8Ux32"); return; + case Iop_QAdd16Ux16: vex_printf("QAdd16Ux16"); return; + case Iop_QAdd8Sx32: vex_printf("QAdd8Sx32"); return; + case Iop_QAdd16Sx16: vex_printf("QAdd16Sx16"); return; + case Iop_QSub8Ux32: vex_printf("QSub8Ux32"); return; + case Iop_QSub16Ux16: vex_printf("QSub16Ux16"); return; + case Iop_QSub8Sx32: vex_printf("QSub8Sx32"); return; + case Iop_QSub16Sx16: vex_printf("QSub16Sx16"); return; + + case Iop_Mul16x16: vex_printf("Mul16x16"); return; + case Iop_Mul32x8: vex_printf("Mul32x8"); return; + case Iop_MulHi16Ux16: vex_printf("MulHi16Ux16"); return; + case Iop_MulHi16Sx16: vex_printf("MulHi16Sx16"); return; + + case Iop_Avg8Ux32: vex_printf("Avg8Ux32"); return; + case Iop_Avg16Ux16: vex_printf("Avg16Ux16"); return; + + case Iop_Max8Sx32: vex_printf("Max8Sx32"); return; + case Iop_Max16Sx16: vex_printf("Max16Sx16"); return; + case Iop_Max32Sx8: vex_printf("Max32Sx8"); return; + case Iop_Max8Ux32: vex_printf("Max8Ux32"); return; + case Iop_Max16Ux16: vex_printf("Max16Ux16"); return; + case Iop_Max32Ux8: vex_printf("Max32Ux8"); return; + + case Iop_Min8Sx32: vex_printf("Min8Sx32"); return; + case Iop_Min16Sx16: vex_printf("Min16Sx16"); return; + case Iop_Min32Sx8: vex_printf("Min32Sx8"); return; + case Iop_Min8Ux32: vex_printf("Min8Ux32"); return; + case Iop_Min16Ux16: vex_printf("Min16Ux16"); return; + case Iop_Min32Ux8: vex_printf("Min32Ux8"); return; + + case Iop_CmpEQ8x32: vex_printf("CmpEQ8x32"); return; + case Iop_CmpEQ16x16: vex_printf("CmpEQ16x16"); return; + case Iop_CmpEQ32x8: vex_printf("CmpEQ32x8"); return; + case Iop_CmpEQ64x4: vex_printf("CmpEQ64x4"); return; + case Iop_CmpGT8Sx32: vex_printf("CmpGT8Sx32"); return; + case Iop_CmpGT16Sx16: vex_printf("CmpGT16Sx16"); return; + case Iop_CmpGT32Sx8: vex_printf("CmpGT32Sx8"); return; + case Iop_CmpGT64Sx4: vex_printf("CmpGT64Sx4"); return; + + case Iop_ShlN16x16: vex_printf("ShlN16x16"); return; + case Iop_ShlN32x8: vex_printf("ShlN32x8"); return; + case Iop_ShlN64x4: vex_printf("ShlN64x4"); return; + case Iop_ShrN16x16: vex_printf("ShrN16x16"); return; + case Iop_ShrN32x8: vex_printf("ShrN32x8"); return; + case Iop_ShrN64x4: vex_printf("ShrN64x4"); return; + case Iop_SarN16x16: 
vex_printf("SarN16x16"); return; + case Iop_SarN32x8: vex_printf("SarN32x8"); return; + + case Iop_Perm32x8: vex_printf("Perm32x8"); return; + default: vpanic("ppIROp(1)"); } @@ -3001,6 +3063,26 @@ void typeOfPrimop ( IROp op, case Iop_XorV256: case Iop_Max32Fx8: case Iop_Min32Fx8: case Iop_Max64Fx4: case Iop_Min64Fx4: + case Iop_Add8x32: case Iop_Add16x16: + case Iop_Add32x8: case Iop_Add64x4: + case Iop_Sub8x32: case Iop_Sub16x16: + case Iop_Sub32x8: case Iop_Sub64x4: + case Iop_Mul16x16: case Iop_Mul32x8: + case Iop_MulHi16Ux16: case Iop_MulHi16Sx16: + case Iop_Avg8Ux32: case Iop_Avg16Ux16: + case Iop_Max8Sx32: case Iop_Max16Sx16: case Iop_Max32Sx8: + case Iop_Max8Ux32: case Iop_Max16Ux16: case Iop_Max32Ux8: + case Iop_Min8Sx32: case Iop_Min16Sx16: case Iop_Min32Sx8: + case Iop_Min8Ux32: case Iop_Min16Ux16: case Iop_Min32Ux8: + case Iop_CmpEQ8x32: case Iop_CmpEQ16x16: + case Iop_CmpEQ32x8: case Iop_CmpEQ64x4: + case Iop_CmpGT8Sx32: case Iop_CmpGT16Sx16: + case Iop_CmpGT32Sx8: case Iop_CmpGT64Sx4: + case Iop_QAdd8Ux32: case Iop_QAdd16Ux16: + case Iop_QAdd8Sx32: case Iop_QAdd16Sx16: + case Iop_QSub8Ux32: case Iop_QSub16Ux16: + case Iop_QSub8Sx32: case Iop_QSub16Sx16: + case Iop_Perm32x8: BINARY(Ity_V256,Ity_V256, Ity_V256); case Iop_V256toV128_1: case Iop_V256toV128_0: @@ -3014,9 +3096,17 @@ void typeOfPrimop ( IROp op, case Iop_Sqrt32Fx8: case Iop_Sqrt64Fx4: case Iop_Recip32Fx8: + case Iop_CmpNEZ8x32: case Iop_CmpNEZ16x16: case Iop_CmpNEZ64x4: case Iop_CmpNEZ32x8: UNARY(Ity_V256, Ity_V256); + case Iop_ShlN16x16: case Iop_ShlN32x8: + case Iop_ShlN64x4: + case Iop_ShrN16x16: case Iop_ShrN32x8: + case Iop_ShrN64x4: + case Iop_SarN16x16: case Iop_SarN32x8: + BINARY(Ity_V256,Ity_I8, Ity_V256); + default: ppIROp(op); vpanic("typeOfPrimop"); diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index aa45d11efb..149b651690 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -1208,11 +1208,16 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps ) orthogonal. */ /* Throw out obviously stupid cases: */ - /* AVX without SSE3 */ Bool have_sse3 = (hwcaps & VEX_HWCAPS_AMD64_SSE3) != 0; Bool have_avx = (hwcaps & VEX_HWCAPS_AMD64_AVX) != 0; + Bool have_bmi = (hwcaps & VEX_HWCAPS_AMD64_BMI) != 0; + Bool have_avx2 = (hwcaps & VEX_HWCAPS_AMD64_AVX2) != 0; + /* AVX without SSE3 */ if (have_avx && !have_sse3) return NULL; + /* AVX2 or BMI without AVX */ + if ((have_avx2 || have_bmi) && !have_avx) + return NULL; /* This isn't threadsafe. We might need to fix it at some point. */ static HChar buf[100] = { 0 }; @@ -1243,6 +1248,12 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps ) if (hwcaps & VEX_HWCAPS_AMD64_AVX) { p = p + vex_sprintf(p, "%s", "-avx"); } + if (hwcaps & VEX_HWCAPS_AMD64_AVX2) { + p = p + vex_sprintf(p, "%s", "-avx2"); + } + if (hwcaps & VEX_HWCAPS_AMD64_BMI) { + p = p + vex_sprintf(p, "%s", "-bmi"); + } out: vassert(buf[sizeof(buf)-1] == 0); diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h index b1061fc9a6..5a67349df2 100644 --- a/VEX/pub/libvex.h +++ b/VEX/pub/libvex.h @@ -79,11 +79,13 @@ typedef /* amd64: baseline capability is SSE2, with cmpxchg8b but not cmpxchg16b. 
*/ -#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */ -#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */ -#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */ -#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */ -#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */ +#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */ +#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */ +#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */ +#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */ +#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */ +#define VEX_HWCAPS_AMD64_BMI (1<<10) /* BMI1 instructions */ +#define VEX_HWCAPS_AMD64_AVX2 (1<<11) /* AVX2 instructions */ /* ppc32: baseline capability is integer only */ #define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */ diff --git a/VEX/pub/libvex_basictypes.h b/VEX/pub/libvex_basictypes.h index 5335e2d053..1d08206855 100644 --- a/VEX/pub/libvex_basictypes.h +++ b/VEX/pub/libvex_basictypes.h @@ -75,6 +75,16 @@ typedef } V128; +/* A union for doing 256-bit vector primitives conveniently. */ +typedef + union { + UChar w8[32]; + UShort w16[16]; + UInt w32[8]; + ULong w64[4]; + } + V256; + /* Floating point. */ typedef float Float; /* IEEE754 single-precision (32-bit) value */ typedef double Double; /* IEEE754 double-precision (64-bit) value */ diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 627ffd7cf2..00a463a353 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1514,7 +1514,34 @@ typedef Iop_NotV256, /* MISC (vector integer cmp != 0) */ - Iop_CmpNEZ32x8, Iop_CmpNEZ64x4, + Iop_CmpNEZ8x32, Iop_CmpNEZ16x16, Iop_CmpNEZ32x8, Iop_CmpNEZ64x4, + + Iop_Add8x32, Iop_Add16x16, Iop_Add32x8, Iop_Add64x4, + Iop_Sub8x32, Iop_Sub16x16, Iop_Sub32x8, Iop_Sub64x4, + + Iop_CmpEQ8x32, Iop_CmpEQ16x16, Iop_CmpEQ32x8, Iop_CmpEQ64x4, + Iop_CmpGT8Sx32, Iop_CmpGT16Sx16, Iop_CmpGT32Sx8, Iop_CmpGT64Sx4, + + Iop_ShlN16x16, Iop_ShlN32x8, Iop_ShlN64x4, + Iop_ShrN16x16, Iop_ShrN32x8, Iop_ShrN64x4, + Iop_SarN16x16, Iop_SarN32x8, + + Iop_Max8Sx32, Iop_Max16Sx16, Iop_Max32Sx8, + Iop_Max8Ux32, Iop_Max16Ux16, Iop_Max32Ux8, + Iop_Min8Sx32, Iop_Min16Sx16, Iop_Min32Sx8, + Iop_Min8Ux32, Iop_Min16Ux16, Iop_Min32Ux8, + + Iop_Mul16x16, Iop_Mul32x8, + Iop_MulHi16Ux16, Iop_MulHi16Sx16, + + Iop_QAdd8Ux32, Iop_QAdd16Ux16, + Iop_QAdd8Sx32, Iop_QAdd16Sx16, + Iop_QSub8Ux32, Iop_QSub16Ux16, + Iop_QSub8Sx32, Iop_QSub16Sx16, + + Iop_Avg8Ux32, Iop_Avg16Ux16, + + Iop_Perm32x8, /* ------------------ 256-bit SIMD FP. ------------------ */ Iop_Add64Fx4,