unop(Iop_32Uto64,sseround) ) );
}
-/* Break a 128-bit value up into four 32-bit ints. */
+/* Break a V128 value up into four 32-bit ints. */
-static void breakup128to32s ( IRTemp t128,
- /*OUTs*/
- IRTemp* t3, IRTemp* t2,
- IRTemp* t1, IRTemp* t0 )
+static void breakupV128to32s ( IRTemp t128,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
{
IRTemp hi64 = newTemp(Ity_I64);
IRTemp lo64 = newTemp(Ity_I64);
assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
}
-/* Construct a 128-bit value from four 32-bit ints. */
+/* Construct a V128 value from four 32-bit ints. */
-static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
- IRTemp t1, IRTemp t0 )
+static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
+ IRTemp t1, IRTemp t0 )
{
return
binop( Iop_64HLtoV128,
);
}
+/* Break a V256 value up into four 64-bit ints. */
+
+static void breakupV256to64s ( IRTemp t256,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ *t0 = newTemp(Ity_I64);
+ *t1 = newTemp(Ity_I64);
+ *t2 = newTemp(Ity_I64);
+ *t3 = newTemp(Ity_I64);
+ assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
+ assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
+ assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
+ assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
+}
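+
+/* Typical usage (a sketch, mirroring callers further down in this
+   file):
+
+      IRTemp s3, s2, s1, s0;
+      s3 = s2 = s1 = s0 = IRTemp_INVALID;
+      breakupV256to64s(sV, &s3, &s2, &s1, &s0);
+
+   after which s3..s0 hold the four 64-bit lanes of sV, highest
+   first.  The OUT temps must be IRTemp_INVALID on entry, per the
+   vasserts above. */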
+
/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
values (aa,bb), computes, for each of the 4 16-bit lanes:
IRTemp s3, s2, s1, s0;
s3 = s2 = s1 = s0 = IRTemp_INVALID;
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
# define SEL(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
IRTemp dV = newTemp(Ity_V128);
assign(dV,
- mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
- SEL((order>>2)&3), SEL((order>>0)&3) )
+ mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
);
# undef SEL
assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
: get_sse_roundingmode() );
t0 = t1 = t2 = t3 = IRTemp_INVALID;
- breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
/* This is less than ideal. If it turns out to be a performance
bottleneck it can be improved. */
# define CVT(_t) \
}
-/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
+/* FIXME: why not just use InterleaveLO / InterleaveHI? I think the
+ relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
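+/* That is -- assuming VEX's usual interleave argument order, where
+   Iop_InterleaveLO32x4(a,b) gives a1:b1:a0:b0 with the highest lane
+   first -- the body below could plausibly shrink to just
+
+      assign(res, binop(xIsH ? Iop_InterleaveHI32x4
+                             : Iop_InterleaveLO32x4,
+                        mkexpr(sV), mkexpr(dV)));
+
+   Untested, hence left as a FIXME. */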
/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
{
IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
- breakup128to32s( dV, &d3, &d2, &d1, &d0 );
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
IRTemp res = newTemp(Ity_V128);
- assign(res, xIsH ? mk128from32s( s3, d3, s2, d2 )
- : mk128from32s( s1, d1, s0, d0 ));
+ assign(res, xIsH ? mkV128from32s( s3, d3, s2, d2 )
+ : mkV128from32s( s1, d1, s0, d0 ));
return res;
}
}
-static IRTemp math_SHUFPS ( IRTemp sV, IRTemp dV, UInt imm8 )
+/* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
+ Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
+ or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
+ way. */
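+/* Lane picture, for reference: with sV = s3:s2:s1:s0 and
+   dV = d3:d2:d1:d0 (64-bit lanes, highest first), the HI case
+   produces s3:d3:s1:d1 and the LO case s2:d2:s0:d0. */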
+static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
+{
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
+ breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
+ IRTemp res = newTemp(Ity_V256);
+ assign(res, xIsH
+ ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
+ mkexpr(s1), mkexpr(d1))
+ : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
+ mkexpr(s0), mkexpr(d0)));
+ return res;
+}
+
+
+/* FIXME: this is really bad. Surely can do something better here?
+ One observation is that the steering in the upper and lower 128 bit
+ halves is the same as with math_UNPCKxPS_128, so we simply split
+ into two halves, and use that. Consequently any improvement in
+ math_UNPCKxPS_128 (probably, to use interleave-style primops)
+ benefits this too. */
+static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
+{
+ IRTemp sVhi = newTemp(Ity_V128);
+ IRTemp sVlo = newTemp(Ity_V128);
+ IRTemp dVhi = newTemp(Ity_V128);
+ IRTemp dVlo = newTemp(Ity_V128);
+ assign(sVhi, unop(Iop_V256toV128_1, mkexpr(sV)));
+ assign(sVlo, unop(Iop_V256toV128_0, mkexpr(sV)));
+ assign(dVhi, unop(Iop_V256toV128_1, mkexpr(dV)));
+ assign(dVlo, unop(Iop_V256toV128_0, mkexpr(dV)));
+ IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
+ IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
+ IRTemp rV = newTemp(Ity_V256);
+ assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
+ return rV;
+}
+
+
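+/* Does the maths for 128 bit versions of SHUFPS.  imm8 carries four
+   2-bit lane selectors: bits 7:6 and 5:4 choose the top two result
+   lanes from sV, and bits 3:2 and 1:0 the bottom two from dV.  For
+   example, imm8 == 0x1B gives, highest lane first, s0:s1:d2:d3. */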
+static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
{
IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
vassert(imm8 < 256);
- breakup128to32s( dV, &d3, &d2, &d1, &d0 );
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
# define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
# define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
IRTemp res = newTemp(Ity_V128);
assign(res,
- mk128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
- SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
+ mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
+ SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
# undef SELD
# undef SELS
return res;
}
+/* 256-bit SHUFPS appears to steer each of the 128-bit halves
+ identically. Hence do the clueless thing and use math_SHUFPS_128
+ twice. */
+static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
+{
+ IRTemp sVhi = newTemp(Ity_V128);
+ IRTemp sVlo = newTemp(Ity_V128);
+ IRTemp dVhi = newTemp(Ity_V128);
+ IRTemp dVlo = newTemp(Ity_V128);
+ assign(sVhi, unop(Iop_V256toV128_1, mkexpr(sV)));
+ assign(sVlo, unop(Iop_V256toV128_0, mkexpr(sV)));
+ assign(dVhi, unop(Iop_V256toV128_1, mkexpr(dV)));
+ assign(dVlo, unop(Iop_V256toV128_0, mkexpr(dV)));
+ IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
+ IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
+ IRTemp rV = newTemp(Ity_V256);
+ assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
+ return rV;
+}
+
+
static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
{
/* This is a really poor translation -- could be improved if
performance critical */
IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
- breakup128to32s( dV, &d3, &d2, &d1, &d0 );
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
IRTemp res = newTemp(Ity_V128);
assign(res, binop(Iop_64HLtoV128,
binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
return deltaIN; /* FAIL */
}
s3 = s2 = s1 = s0 = IRTemp_INVALID;
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
switch (imm8) {
case 0: assign(d16, unop(Iop_32to16, mkexpr(s0))); break;
case 1: assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
}
+static Long dis_CVTDQ2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
+ Long delta, Bool isAvx )
+{
+ IRTemp addr = IRTemp_INVALID;
+ Int alen = 0;
+ HChar dis_buf[50];
+ UChar modrm = getUChar(delta);
+ IRTemp arg64 = newTemp(Ity_I64);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ const HChar* mbV = isAvx ? "v" : "";
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( arg64, getXMMRegLane64(rE, 0) );
+ delta += 1;
+ DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += alen;
+ DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
+ }
+ putXMMRegLane64F(
+ rG, 0,
+ unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
+ );
+ putXMMRegLane64F(
+ rG, 1,
+ unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
+ );
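+ /* The AVX-encoded form zeroes the upper half of the destination
+    YMM register; the legacy SSE encoding leaves it unchanged. */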
+ if (isAvx)
+ putYMMRegLane128(rG, 1, mkV128(0));
+ return delta;
+}
+
+
/* Note, this also handles SSE(1) insns. */
__attribute__((noinline))
static
}
assign( rmode, get_sse_roundingmode() );
- breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
# define CVT(_t) binop( Iop_F64toF32, \
mkexpr(rmode), \
delta += 1+alen;
DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
}
- IRTemp res = math_SHUFPS( sV, dV, imm8 );
+ IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
goto decode_success;
}
/* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
F64 in xmm(G) */
if (haveF3no66noF2(pfx) && sz == 4) {
- IRTemp arg64 = newTemp(Ity_I64);
-
- modrm = getUChar(delta);
- if (epartIsReg(modrm)) {
- assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
- delta += 1;
- DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
- nameXMMReg(gregOfRexRM(pfx,modrm)));
- } else {
- addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
- assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
- delta += alen;
- DIP("cvtdq2pd %s,%s\n", dis_buf,
- nameXMMReg(gregOfRexRM(pfx,modrm)) );
- }
-
- putXMMRegLane64F(
- gregOfRexRM(pfx,modrm), 0,
- unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
- );
-
- putXMMRegLane64F(
- gregOfRexRM(pfx,modrm), 1,
- unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
- );
-
+ delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
goto decode_success;
}
break;
delta += alen;
}
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
putXMMReg( gregOfRexRM(pfx,modrm),
- mk128from32s( s2, s2, s0, s0 ) );
+ mkV128from32s( s2, s2, s0, s0 ) );
goto decode_success;
}
/* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
delta += alen;
}
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
putXMMReg( gregOfRexRM(pfx,modrm),
- mk128from32s( s3, s3, s1, s1 ) );
+ mkV128from32s( s3, s3, s1, s1 ) );
goto decode_success;
}
break;
assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
- breakup128to32s( eV, &e3, &e2, &e1, &e0 );
- breakup128to32s( gV, &g3, &g2, &g1, &g0 );
+ breakupV128to32s( eV, &e3, &e2, &e1, &e0 );
+ breakupV128to32s( gV, &g3, &g2, &g1, &g0 );
- assign( leftV, mk128from32s( e2, e0, g2, g0 ) );
- assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
+ assign( leftV, mkV128from32s( e2, e0, g2, g0 ) );
+ assign( rightV, mkV128from32s( e3, e1, g3, g1 ) );
putXMMReg( gregOfRexRM(pfx,modrm),
binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
- breakup128to32s( addV, &a3, &a2, &a1, &a0 );
- breakup128to32s( subV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
+ breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
- putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( a3, s2, a1, s0 ));
+ putXMMReg( gregOfRexRM(pfx,modrm), mkV128from32s( a3, s2, a1, s0 ));
goto decode_success;
}
break;
nameXMMReg(gregOfRexRM(pfx,modrm)));
}
- breakup128to32s( dV, &d3, &d2, &d1, &d0 );
- breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
assign( t0, binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) );
putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
vassert(0==getRexW(pfx)); /* ensured by caller */
modrm = getUChar(delta);
assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
- breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
if ( epartIsReg( modrm ) ) {
imm8_10 = (Int)(getUChar(delta+1) & 3);
UShort mask = 0;
switch (imm8) {
case 3: mask = 0x0FFF;
- assign(withZs, mk128from32s(u32, z32, z32, z32));
+ assign(withZs, mkV128from32s(u32, z32, z32, z32));
break;
case 2: mask = 0xF0FF;
- assign(withZs, mk128from32s(z32, u32, z32, z32));
+ assign(withZs, mkV128from32s(z32, u32, z32, z32));
break;
case 1: mask = 0xFF0F;
- assign(withZs, mk128from32s(z32, z32, u32, z32));
+ assign(withZs, mkV128from32s(z32, z32, u32, z32));
break;
case 0: mask = 0xFFF0;
- assign(withZs, mk128from32s(z32, z32, z32, u32));
+ assign(withZs, mkV128from32s(z32, z32, z32, u32));
break;
default: vassert(0);
}
{
const IRTemp inval = IRTemp_INVALID;
IRTemp dstDs[4] = { inval, inval, inval, inval };
- breakup128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
+ breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
vassert(imm8 <= 255);
dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
IRTemp zero_32 = newTemp(Ity_I32);
assign( zero_32, mkU32(0) );
IRTemp resV = newTemp(Ity_V128);
- assign( resV, mk128from32s(
+ assign( resV, mkV128from32s(
((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
Int imm8;
assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
t3 = t2 = t1 = t0 = IRTemp_INVALID;
- breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
if ( epartIsReg( modrm ) ) {
imm8 = (Int)getUChar(delta+1);
modrm = getUChar(delta);
assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
- breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
if ( epartIsReg( modrm ) ) {
imm8_20 = (Int)(getUChar(delta+1) & 7);
modrm = getUChar(delta);
assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
- breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+ breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
if ( epartIsReg( modrm ) ) {
imm8_10 = (Int)(getUChar(delta+1) & 3);
IRTemp vE = newTemp(Ity_V128);
assign( vE, getXMMReg(rE) );
IRTemp dsE[4] = { inval, inval, inval, inval };
- breakup128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
+ breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
imm8 = getUChar(delta+1);
d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
delta += 1+1;
binop( Iop_Mul32Fx4, mkexpr(xmm1_vec),
mkexpr(xmm2_vec) ),
mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
- breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
- assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );
+ breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
+ assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );
assign( sum_vec, binop( Iop_Add32Fx4,
binop( Iop_InterleaveHI32x4,
*uses_vvvv = True;
goto decode_success;
}
+ /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
+ /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ Bool hi = opc == 0x15;
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp eV = newTemp(Ity_V256);
+ IRTemp vV = newTemp(Ity_V256);
+ assign( vV, getYMMReg(rV) );
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( eV, getYMMReg(rE) );
+ delta += 1;
+ DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
+ nameYMMReg(rE), nameYMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
+ delta += alen;
+ DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
+ dis_buf, nameYMMReg(rG));
+ }
+ IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
+ putYMMReg( rG, mkexpr(res) );
+ *uses_vvvv = True;
+ goto decode_success;
+ }
/* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
/* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
*uses_vvvv = True;
goto decode_success;
}
+ /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
+ /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ Bool hi = opc == 0x15;
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp eV = newTemp(Ity_V256);
+ IRTemp vV = newTemp(Ity_V256);
+ assign( vV, getYMMReg(rV) );
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( eV, getYMMReg(rE) );
+ delta += 1;
+ DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
+ nameYMMReg(rE), nameYMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
+ delta += alen;
+ DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
+ dis_buf, nameYMMReg(rG));
+ }
+ IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
+ putYMMReg( rG, mkexpr(res) );
+ *uses_vvvv = True;
+ goto decode_success;
+ }
break;
case 0x16:
uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
goto decode_success;
}
+ /* VANDPD r/m, rV, r ::: r = rV & r/m */
+ /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
+ goto decode_success;
+ }
/* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
goto decode_success;
}
+ /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
+ /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
+ goto decode_success;
+ }
/* VXORPS r/m, rV, r ::: r = rV ^ r/m */
/* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
goto decode_success;
}
+ /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_AVX128_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
+ goto decode_success;
+ }
/* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
delta = dis_AVX256_E_V_to_G(
if (delta > delta0) goto decode_success;
/* else fall through -- decoding has failed */
}
+ /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
+ /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ Long delta0 = delta;
+ delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
+ "vcmppd", True/*all_lanes*/,
+ 8/*sz*/);
+ if (delta > delta0) goto decode_success;
+ /* else fall through -- decoding has failed */
+ }
break;
case 0xC5:
DIP("vshufps $%d,%s,%s,%s\n",
imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
}
- IRTemp res = math_SHUFPS( eV, vV, imm8 );
- putYMMRegLoAndZU( gregOfRexRM(pfx,modrm), mkexpr(res) );
+ IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
+ putYMMRegLoAndZU( rG, mkexpr(res) );
+ *uses_vvvv = True;
+ goto decode_success;
+ }
+ /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
+ /* = VEX.NDS.256.0F.WIG C6 /r ib */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ Int imm8 = 0;
+ IRTemp eV = newTemp(Ity_V256);
+ IRTemp vV = newTemp(Ity_V256);
+ UInt modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ UInt rV = getVexNvvvv(pfx);
+ assign( vV, getYMMReg(rV) );
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( eV, getYMMReg(rE) );
+ imm8 = (Int)getUChar(delta+1);
+ delta += 1+1;
+ DIP("vshufps $%d,%s,%s,%s\n",
+ imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
+ assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
+ imm8 = (Int)getUChar(delta+alen);
+ delta += 1+alen;
+ DIP("vshufps $%d,%s,%s,%s\n",
+ imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+ }
+ IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
+ putYMMReg( rG, mkexpr(res) );
*uses_vvvv = True;
goto decode_success;
}
}
break;
+ case 0xE6:
+ /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
+ if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
+ goto decode_success;
+ }
+ break;
+
case 0xE7:
/* MOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
Prefix pfx, Int sz, Long deltaIN
)
{
- //IRTemp addr = IRTemp_INVALID;
- //Int alen = 0;
- //HChar dis_buf[50];
+ IRTemp addr = IRTemp_INVALID;
+ Int alen = 0;
+ HChar dis_buf[50];
Long delta = deltaIN;
UChar opc = getUChar(delta);
delta++;
}
break;
+ case 0x19:
+ /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.W0 19 /r */
+ if (have66noF2noF3(pfx)
+ && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/
+ && !epartIsReg(getUChar(delta))) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
+ IRTemp t64 = newTemp(Ity_I64);
+ assign(t64, loadLE(Ity_I64, mkexpr(addr)));
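+ /* Replicate the loaded 64-bit value into all four lanes. */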
+ IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
+ mkexpr(t64), mkexpr(t64));
+ putYMMReg(rG, res);
+ goto decode_success;
+ }
+ break;
+
case 0x1E:
/* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
assign(sV, loadLE(Ity_V256, mkexpr(addr)));
}
delta++;
- IRTemp s3 = newTemp(Ity_I64);
- IRTemp s2 = newTemp(Ity_I64);
- IRTemp s1 = newTemp(Ity_I64);
- IRTemp s0 = newTemp(Ity_I64);
- assign(s3, unop(Iop_V256to64_3, mkexpr(sV)));
- assign(s2, unop(Iop_V256to64_2, mkexpr(sV)));
- assign(s1, unop(Iop_V256to64_1, mkexpr(sV)));
- assign(s0, unop(Iop_V256to64_0, mkexpr(sV)));
+ IRTemp s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ breakupV256to64s(sV, &s3, &s2, &s1, &s0);
IRTemp dV = newTemp(Ity_V256);
assign(dV, IRExpr_Qop(Iop_64x4toV256,
mkexpr((imm8 & (1<<3)) ? s3 : s2),
IRTemp vE = newTemp(Ity_V128);
assign( vE, getXMMReg(rE) );
IRTemp dsE[4] = { inval, inval, inval, inval };
- breakup128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
+ breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
imm8 = getUChar(delta+1);
d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
delta += 1+1;