From 7ee72fff7d7801e6b1027aa4509d8a63b5f5302c Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Wed, 11 May 2005 00:03:06 +0000 Subject: [PATCH] SSE2, on and on and on. There are more different SSE2 instructions than there are atoms in the universe. This much, at least, I now know. git-svn-id: svn://svn.valgrind.org/vex/trunk@1179 --- VEX/priv/guest-amd64/ghelpers.c | 8 + VEX/priv/guest-amd64/toIR.c | 929 ++++++++++++++++---------------- VEX/priv/host-amd64/hdefs.c | 56 +- VEX/priv/host-amd64/hdefs.h | 21 +- VEX/priv/host-amd64/isel.c | 50 +- 5 files changed, 549 insertions(+), 515 deletions(-) diff --git a/VEX/priv/guest-amd64/ghelpers.c b/VEX/priv/guest-amd64/ghelpers.c index a44117b29e..88bef6577b 100644 --- a/VEX/priv/guest-amd64/ghelpers.c +++ b/VEX/priv/guest-amd64/ghelpers.c @@ -1488,6 +1488,14 @@ ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy ) return (ULong)t; } +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ) +{ + ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi ); + ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo ); + return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF); +} + /*---------------------------------------------------------------*/ /*--- Helpers for dealing with, and describing, ---*/ diff --git a/VEX/priv/guest-amd64/toIR.c b/VEX/priv/guest-amd64/toIR.c index 7b8400597b..12629c09fc 100644 --- a/VEX/priv/guest-amd64/toIR.c +++ b/VEX/priv/guest-amd64/toIR.c @@ -7665,78 +7665,80 @@ static ULong dis_SSEcmp_E_to_G ( Prefix pfx, ULong delta, } -//.. /* Vector by scalar shift of G by the amount specified at the bottom -//.. of E. */ -//.. -//.. static UInt dis_SSE_shiftG_byE ( UChar sorb, ULong delta, -//.. HChar* opname, IROp op ) -//.. { -//.. HChar dis_buf[50]; -//.. Int alen, size; -//.. IRTemp addr; -//.. Bool shl, shr, sar; -//.. UChar rm = getUChar(delta); -//.. IRTemp g0 = newTemp(Ity_V128); -//.. IRTemp g1 = newTemp(Ity_V128); -//.. IRTemp amt = newTemp(Ity_I32); -//.. IRTemp amt8 = newTemp(Ity_I8); -//.. if (epartIsReg(rm)) { -//.. assign( amt, getXMMRegLane32(eregOfRM(rm), 0) ); -//.. DIP("%s %s,%s\n", opname, -//.. nameXMMReg(eregOfRM(rm)), -//.. nameXMMReg(gregOfRM(rm)) ); -//.. delta++; -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta, dis_buf ); -//.. assign( amt, loadLE(Ity_I32, mkexpr(addr)) ); -//.. DIP("%s %s,%s\n", opname, -//.. dis_buf, -//.. nameXMMReg(gregOfRM(rm)) ); -//.. delta += alen; -//.. } -//.. assign( g0, getXMMReg(gregOfRM(rm)) ); -//.. assign( amt8, unop(Iop_32to8, mkexpr(amt)) ); -//.. -//.. shl = shr = sar = False; -//.. size = 0; -//.. switch (op) { -//.. case Iop_ShlN16x8: shl = True; size = 32; break; -//.. case Iop_ShlN32x4: shl = True; size = 32; break; -//.. case Iop_ShlN64x2: shl = True; size = 64; break; -//.. case Iop_SarN16x8: sar = True; size = 16; break; -//.. case Iop_SarN32x4: sar = True; size = 32; break; -//.. case Iop_ShrN16x8: shr = True; size = 16; break; -//.. case Iop_ShrN32x4: shr = True; size = 32; break; -//.. case Iop_ShrN64x2: shr = True; size = 64; break; -//.. default: vassert(0); -//.. } -//.. -//.. if (shl || shr) { -//.. assign( -//.. g1, -//.. IRExpr_Mux0X( -//.. unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))), -//.. mkV128(0x0000), -//.. binop(op, mkexpr(g0), mkexpr(amt8)) -//.. ) -//.. ); -//.. } else -//.. if (sar) { -//.. assign( -//.. g1, -//.. IRExpr_Mux0X( -//.. unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))), -//.. binop(op, mkexpr(g0), mkU8(size-1)), -//.. binop(op, mkexpr(g0), mkexpr(amt8)) -//.. 
) -//.. ); -//.. } else { -//.. vassert(0); -//.. } -//.. -//.. putXMMReg( gregOfRM(rm), mkexpr(g1) ); -//.. return delta; -//.. } +/* Vector by scalar shift of G by the amount specified at the bottom + of E. */ + +static ULong dis_SSE_shiftG_byE ( Prefix pfx, ULong delta, + HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen, size; + IRTemp addr; + Bool shl, shr, sar; + UChar rm = getUChar(delta); + IRTemp g0 = newTemp(Ity_V128); + IRTemp g1 = newTemp(Ity_V128); + IRTemp amt = newTemp(Ity_I32); + IRTemp amt8 = newTemp(Ity_I8); + if (epartIsReg(rm)) { + assign( amt, getXMMRegLane32(eregOfRexRM(pfx,rm), 0) ); + DIP("%s %s,%s\n", opname, + nameXMMReg(eregOfRexRM(pfx,rm)), + nameXMMReg(gregOfRexRM(pfx,rm)) ); + delta++; + } else { + addr = disAMode ( &alen, pfx, delta, dis_buf, 0 ); + assign( amt, loadLE(Ity_I32, mkexpr(addr)) ); + DIP("%s %s,%s\n", opname, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,rm)) ); + delta += alen; + } + assign( g0, getXMMReg(gregOfRexRM(pfx,rm)) ); + assign( amt8, unop(Iop_32to8, mkexpr(amt)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + case Iop_ShlN16x8: shl = True; size = 32; break; + case Iop_ShlN32x4: shl = True; size = 32; break; + case Iop_ShlN64x2: shl = True; size = 64; break; + case Iop_SarN16x8: sar = True; size = 16; break; + case Iop_SarN32x4: sar = True; size = 32; break; + case Iop_ShrN16x8: shr = True; size = 16; break; + case Iop_ShrN32x4: shr = True; size = 32; break; + case Iop_ShrN64x2: shr = True; size = 64; break; + default: vassert(0); + } + + if (shl || shr) { + assign( + g1, + IRExpr_Mux0X( + unop(Iop_1Uto8, + binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))), + mkV128(0x0000), + binop(op, mkexpr(g0), mkexpr(amt8)) + ) + ); + } else + if (sar) { + assign( + g1, + IRExpr_Mux0X( + unop(Iop_1Uto8, + binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))), + binop(op, mkexpr(g0), mkU8(size-1)), + binop(op, mkexpr(g0), mkexpr(amt8)) + ) + ); + } else { + vassert(0); + } + + putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) ); + return delta; +} /* Vector by scalar shift of E by an immediate byte. */ @@ -10756,153 +10758,163 @@ DisResult disInstr ( /*IN*/ Bool resteerOK, goto decode_success; } -//.. /* 66 0F EE = PMAXSW -- 16x8 signed max */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pmaxsw", Iop_Max16Sx8, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F DE = PMAXUB -- 8x16 unsigned max */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pmaxub", Iop_Max8Ux16, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F EA = PMINSW -- 16x8 signed min */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pminsw", Iop_Min16Sx8, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F DA = PMINUB -- 8x16 unsigned min */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pminub", Iop_Min8Ux16, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in -//.. xmm(G), turn them into a byte, and put zero-extend of it in -//.. ireg(G). Doing this directly is just too cumbersome; give up -//.. therefore and call a helper. */ -//.. /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) { -//.. 
modrm = insn[2]; -//.. if (epartIsReg(modrm)) { -//.. t0 = newTemp(Ity_I64); -//.. t1 = newTemp(Ity_I64); -//.. assign(t0, getXMMRegLane64(eregOfRM(modrm), 0)); -//.. assign(t1, getXMMRegLane64(eregOfRM(modrm), 1)); -//.. t5 = newTemp(Ity_I32); -//.. assign(t5, mkIRExprCCall( -//.. Ity_I32, 0/*regparms*/, -//.. "x86g_calculate_sse_pmovmskb", -//.. &x86g_calculate_sse_pmovmskb, -//.. mkIRExprVec_2( mkexpr(t1), mkexpr(t0) ))); -//.. putIReg(4, gregOfRM(modrm), mkexpr(t5)); -//.. DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)), -//.. nameIReg(4,gregOfRM(modrm))); -//.. delta += 3; -//.. goto decode_success; -//.. } -//.. /* else fall through */ -//.. } -//.. -//.. /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pmulhuw", Iop_MulHi16Ux8, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pmulhw", Iop_MulHi16Sx8, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F D5 = PMULHL -- 16x8 multiply */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) { -//.. delta = dis_SSEint_E_to_G( sorb, delta+2, -//.. "pmullw", Iop_Mul16x8, False ); -//.. goto decode_success; -//.. } -//.. -//.. /* ***--- this is an MMX class insn introduced in SSE2 ---*** */ -//.. /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x -//.. 0 to form 64-bit result */ -//.. if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) { -//.. IRTemp sV = newTemp(Ity_I64); -//.. IRTemp dV = newTemp(Ity_I64); -//.. t1 = newTemp(Ity_I32); -//.. t0 = newTemp(Ity_I32); -//.. modrm = insn[2]; -//.. -//.. do_MMX_preamble(); -//.. assign( dV, getMMXReg(gregOfRM(modrm)) ); -//.. -//.. if (epartIsReg(modrm)) { -//.. assign( sV, getMMXReg(eregOfRM(modrm)) ); -//.. delta += 2+1; -//.. DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)), -//.. nameMMXReg(gregOfRM(modrm))); -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta+2, dis_buf ); -//.. assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); -//.. delta += 2+alen; -//.. DIP("pmuludq %s,%s\n", dis_buf, -//.. nameMMXReg(gregOfRM(modrm))); -//.. } -//.. -//.. assign( t0, unop(Iop_64to32, mkexpr(dV)) ); -//.. assign( t1, unop(Iop_64to32, mkexpr(sV)) ); -//.. putMMXReg( gregOfRM(modrm), -//.. binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x -//.. 0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit -//.. half */ -//.. /* This is a really poor translation -- could be improved if -//.. performance critical */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) { -//.. IRTemp sV, dV; -//.. IRTemp s3, s2, s1, s0, d3, d2, d1, d0; -//.. sV = newTemp(Ity_V128); -//.. dV = newTemp(Ity_V128); -//.. s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; -//.. t1 = newTemp(Ity_I64); -//.. t0 = newTemp(Ity_I64); -//.. modrm = insn[2]; -//.. assign( dV, getXMMReg(gregOfRM(modrm)) ); -//.. -//.. if (epartIsReg(modrm)) { -//.. assign( sV, getXMMReg(eregOfRM(modrm)) ); -//.. delta += 2+1; -//.. DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)), -//.. nameXMMReg(gregOfRM(modrm))); -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta+2, dis_buf ); -//.. assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); -//.. delta += 2+alen; -//.. 
DIP("pmuludq %s,%s\n", dis_buf, -//.. nameXMMReg(gregOfRM(modrm))); -//.. } -//.. -//.. breakup128to32s( dV, &d3, &d2, &d1, &d0 ); -//.. breakup128to32s( sV, &s3, &s2, &s1, &s0 ); -//.. -//.. assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ); -//.. putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) ); -//.. assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) ); -//.. putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) ); -//.. goto decode_success; -//.. } + /* 66 0F EE = PMAXSW -- 16x8 signed max */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xEE) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pmaxsw", Iop_Max16Sx8, False ); + goto decode_success; + } + + /* 66 0F DE = PMAXUB -- 8x16 unsigned max */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xDE) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pmaxub", Iop_Max8Ux16, False ); + goto decode_success; + } + + /* 66 0F EA = PMINSW -- 16x8 signed min */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xEA) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pminsw", Iop_Min16Sx8, False ); + goto decode_success; + } + + /* 66 0F DA = PMINUB -- 8x16 unsigned min */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xDA) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pminub", Iop_Min8Ux16, False ); + goto decode_success; + } + + /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in + xmm(E), turn them into a byte, and put zero-extend of it in + ireg(G). Doing this directly is just too cumbersome; give up + therefore and call a helper. */ + /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xD7) { + modrm = insn[2]; + if (epartIsReg(modrm)) { + t0 = newTemp(Ity_I64); + t1 = newTemp(Ity_I64); + assign(t0, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0)); + assign(t1, getXMMRegLane64(eregOfRexRM(pfx,modrm), 1)); + t5 = newTemp(Ity_I64); + assign(t5, mkIRExprCCall( + Ity_I64, 0/*regparms*/, + "amd64g_calculate_sse_pmovmskb", + &amd64g_calculate_sse_pmovmskb, + mkIRExprVec_2( mkexpr(t1), mkexpr(t0) ))); + putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t5))); + DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameIReg32(gregOfRexRM(pfx,modrm))); + delta += 3; + goto decode_success; + } + /* else fall through */ + } + + /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xE4) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pmulhuw", Iop_MulHi16Ux8, False ); + goto decode_success; + } + + /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xE5) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pmulhw", Iop_MulHi16Sx8, False ); + goto decode_success; + } + + /* 66 0F D5 = PMULHL -- 16x8 multiply */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xD5) { + delta = dis_SSEint_E_to_G( pfx, delta+2, + "pmullw", Iop_Mul16x8, False ); + goto decode_success; + } + + /* ***--- this is an MMX class insn introduced in SSE2 ---*** */ + /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x + 0 to form 64-bit result */ + if (haveNo66noF2noF3(pfx) && sz == 4 + && insn[0] == 0x0F && insn[1] == 0xF4) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + t1 = newTemp(Ity_I32); + t0 = newTemp(Ity_I32); + modrm = insn[2]; + 
+ do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 2+1; + DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+2, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 2+alen; + DIP("pmuludq %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + assign( t0, unop(Iop_64to32, mkexpr(dV)) ); + assign( t1, unop(Iop_64to32, mkexpr(sV)) ); + putMMXReg( gregLO3ofRM(modrm), + binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) ); + goto decode_success; + } + + /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x + 0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit + half */ + /* This is a really poor translation -- could be improved if + performance critical */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xF4) { + IRTemp sV, dV; + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + sV = newTemp(Ity_V128); + dV = newTemp(Ity_V128); + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + t1 = newTemp(Ity_I64); + t0 = newTemp(Ity_I64); + modrm = insn[2]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 2+1; + DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+2, dis_buf, 0 ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 2+alen; + DIP("pmuludq %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + breakup128to32s( dV, &d3, &d2, &d1, &d0 ); + breakup128to32s( sV, &s3, &s2, &s1, &s0 ); + + assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ); + putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) ); + assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) ); + putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) ); + goto decode_success; + } /* 66 0F EB = POR */ if (have66noF2noF3(pfx) && sz == 2 @@ -10911,150 +10923,152 @@ DisResult disInstr ( /*IN*/ Bool resteerOK, goto decode_success; } -//.. /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) { -//.. Int order; -//.. IRTemp sV, dV, s3, s2, s1, s0; -//.. s3 = s2 = s1 = s0 = IRTemp_INVALID; -//.. sV = newTemp(Ity_V128); -//.. dV = newTemp(Ity_V128); -//.. modrm = insn[2]; -//.. if (epartIsReg(modrm)) { -//.. assign( sV, getXMMReg(eregOfRM(modrm)) ); -//.. order = (Int)insn[3]; -//.. delta += 2+2; -//.. DIP("pshufd $%d,%s,%s\n", order, -//.. nameXMMReg(eregOfRM(modrm)), -//.. nameXMMReg(gregOfRM(modrm))); -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta+2, dis_buf ); -//.. assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); -//.. order = (Int)insn[2+alen]; -//.. delta += 3+alen; -//.. DIP("pshufd $%d,%s,%s\n", order, -//.. dis_buf, -//.. nameXMMReg(gregOfRM(modrm))); -//.. } -//.. breakup128to32s( sV, &s3, &s2, &s1, &s0 ); -//.. -#if 0 /* stop gcc multi-line comment warning */ -/.. # define SEL(n) \ -/.. ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) -#endif /* stop gcc multi-line comment warning */ -//.. assign(dV, -//.. mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3), -//.. SEL((order>>2)&3), SEL((order>>0)&3) ) -//.. ); -//.. putXMMReg(gregOfRM(modrm), mkexpr(dV)); -//.. # undef SEL -//.. goto decode_success; -//.. } -//.. -//.. 
/* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or -//.. mem) to G(xmm), and copy lower half */ -//.. if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) { -//.. Int order; -//.. IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0; -//.. s3 = s2 = s1 = s0 = IRTemp_INVALID; -//.. sV = newTemp(Ity_V128); -//.. dV = newTemp(Ity_V128); -//.. sVhi = newTemp(Ity_I64); -//.. dVhi = newTemp(Ity_I64); -//.. modrm = insn[3]; -//.. if (epartIsReg(modrm)) { -//.. assign( sV, getXMMReg(eregOfRM(modrm)) ); -//.. order = (Int)insn[4]; -//.. delta += 4+1; -//.. DIP("pshufhw $%d,%s,%s\n", order, -//.. nameXMMReg(eregOfRM(modrm)), -//.. nameXMMReg(gregOfRM(modrm))); -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta+3, dis_buf ); -//.. assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); -//.. order = (Int)insn[3+alen]; -//.. delta += 4+alen; -//.. DIP("pshufhw $%d,%s,%s\n", order, -//.. dis_buf, -//.. nameXMMReg(gregOfRM(modrm))); -//.. } -//.. assign( sVhi, unop(Iop_128HIto64, mkexpr(sV)) ); -//.. breakup64to16s( sVhi, &s3, &s2, &s1, &s0 ); -//.. -#if 0 /* stop gcc multi-line comment warning */ -/.. # define SEL(n) \ -/.. ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) -#endif /* stop gcc multi-line comment warning */ -//.. assign(dVhi, -//.. mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), -//.. SEL((order>>2)&3), SEL((order>>0)&3) ) -//.. ); -//.. assign(dV, binop( Iop_64HLto128, -//.. mkexpr(dVhi), -//.. unop(Iop_128to64, mkexpr(sV))) ); -//.. putXMMReg(gregOfRM(modrm), mkexpr(dV)); -//.. # undef SEL -//.. goto decode_success; -//.. } -//.. -//.. /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or -//.. mem) to G(xmm), and copy upper half */ -//.. if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) { -//.. Int order; -//.. IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0; -//.. s3 = s2 = s1 = s0 = IRTemp_INVALID; -//.. sV = newTemp(Ity_V128); -//.. dV = newTemp(Ity_V128); -//.. sVlo = newTemp(Ity_I64); -//.. dVlo = newTemp(Ity_I64); -//.. modrm = insn[3]; -//.. if (epartIsReg(modrm)) { -//.. assign( sV, getXMMReg(eregOfRM(modrm)) ); -//.. order = (Int)insn[4]; -//.. delta += 4+1; -//.. DIP("pshuflw $%d,%s,%s\n", order, -//.. nameXMMReg(eregOfRM(modrm)), -//.. nameXMMReg(gregOfRM(modrm))); -//.. } else { -//.. addr = disAMode ( &alen, sorb, delta+3, dis_buf ); -//.. assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); -//.. order = (Int)insn[3+alen]; -//.. delta += 4+alen; -//.. DIP("pshuflw $%d,%s,%s\n", order, -//.. dis_buf, -//.. nameXMMReg(gregOfRM(modrm))); -//.. } -//.. assign( sVlo, unop(Iop_128to64, mkexpr(sV)) ); -//.. breakup64to16s( sVlo, &s3, &s2, &s1, &s0 ); -//.. -#if 0 /* stop gcc multi-line comment warning */ -/.. # define SEL(n) \ -/.. ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) -#endif /* stop gcc multi-line comment warning */ -//.. assign(dVlo, -//.. mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), -//.. SEL((order>>2)&3), SEL((order>>0)&3) ) -//.. ); -//.. assign(dV, binop( Iop_64HLto128, -//.. unop(Iop_128HIto64, mkexpr(sV)), -//.. mkexpr(dVlo) ) ); -//.. putXMMReg(gregOfRM(modrm), mkexpr(dV)); -//.. # undef SEL -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 72 /6 ib = PSLLD by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 6) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F F2 = PSLLD by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) { -//.. 
delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 ); -//.. goto decode_success; -//.. } + /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x70) { + Int order; + IRTemp sV, dV, s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + sV = newTemp(Ity_V128); + dV = newTemp(Ity_V128); + modrm = insn[2]; + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + order = (Int)insn[3]; + delta += 3+1; + DIP("pshufd $%d,%s,%s\n", order, + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+2, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + order = (Int)insn[2+alen]; + delta += 2+alen+1; + DIP("pshufd $%d,%s,%s\n", order, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + breakup128to32s( sV, &s3, &s2, &s1, &s0 ); + +# define SEL(n) \ + ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) + assign(dV, + mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3), + SEL((order>>2)&3), SEL((order>>0)&3) ) + ); + putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV)); +# undef SEL + goto decode_success; + } + + /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or + mem) to G(xmm), and copy lower half */ + if (haveF3no66noF2(pfx) && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x70) { + Int order; + IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + sV = newTemp(Ity_V128); + dV = newTemp(Ity_V128); + sVhi = newTemp(Ity_I64); + dVhi = newTemp(Ity_I64); + modrm = insn[2]; + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + order = (Int)insn[3]; + delta += 3+1; + DIP("pshufhw $%d,%s,%s\n", order, + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+2, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + order = (Int)insn[2+alen]; + delta += 2+alen+1; + DIP("pshufhw $%d,%s,%s\n", order, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) ); + breakup64to16s( sVhi, &s3, &s2, &s1, &s0 ); + +# define SEL(n) \ + ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? 
s2 : s3))) + assign(dVhi, + mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), + SEL((order>>2)&3), SEL((order>>0)&3) ) + ); + assign(dV, binop( Iop_64HLtoV128, + mkexpr(dVhi), + unop(Iop_V128to64, mkexpr(sV))) ); + putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV)); +# undef SEL + goto decode_success; + } + + /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or + mem) to G(xmm), and copy upper half */ + if (haveF2no66noF3(pfx) && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x70) { + Int order; + IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + sV = newTemp(Ity_V128); + dV = newTemp(Ity_V128); + sVlo = newTemp(Ity_I64); + dVlo = newTemp(Ity_I64); + modrm = insn[2]; + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + order = (Int)insn[3]; + delta += 3+1; + DIP("pshuflw $%d,%s,%s\n", order, + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+2, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + order = (Int)insn[2+alen]; + delta += 2+alen+1; + DIP("pshuflw $%d,%s,%s\n", order, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) ); + breakup64to16s( sVlo, &s3, &s2, &s1, &s0 ); + +# define SEL(n) \ + ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3))) + assign(dVlo, + mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), + SEL((order>>2)&3), SEL((order>>0)&3) ) + ); + assign(dV, binop( Iop_64HLtoV128, + unop(Iop_V128HIto64, mkexpr(sV)), + mkexpr(dVlo) ) ); + putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV)); +# undef SEL + goto decode_success; + } + + /* 66 0F 72 /6 ib = PSLLD by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x72 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 6) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "pslld", Iop_ShlN32x4 ); + goto decode_success; + } + + /* 66 0F F2 = PSLLD by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xF2) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "pslld", Iop_ShlN32x4 ); + goto decode_success; + } /* 66 0F 73 /7 ib = PSLLDQ by immediate */ /* note, if mem case ever filled in, 1 byte after amode */ @@ -11118,75 +11132,85 @@ DisResult disInstr ( /*IN*/ Bool resteerOK, goto decode_success; } -//.. /* 66 0F 73 /6 ib = PSLLQ by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 6) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F F3 = PSLLQ by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 71 /6 ib = PSLLW by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 6) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F F1 = PSLLW by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 72 /4 ib = PSRAD by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 4) { -//.. 
delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F E2 = PSRAD by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 71 /4 ib = PSRAW by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 4) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F E1 = PSRAW by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 72 /2 ib = PSRLD by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 2) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F D2 = PSRLD by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 ); -//.. goto decode_success; -//.. } + /* 66 0F 73 /6 ib = PSLLQ by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x73 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 6) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllq", Iop_ShlN64x2 ); + goto decode_success; + } + + /* 66 0F F3 = PSLLQ by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xF3) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psllq", Iop_ShlN64x2 ); + goto decode_success; + } + + /* 66 0F 71 /6 ib = PSLLW by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x71 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 6) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllw", Iop_ShlN16x8 ); + goto decode_success; + } + + /* 66 0F F1 = PSLLW by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xF1) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psllw", Iop_ShlN16x8 ); + goto decode_success; + } + + /* 66 0F 72 /4 ib = PSRAD by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x72 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 4) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrad", Iop_SarN32x4 ); + goto decode_success; + } + + /* 66 0F E2 = PSRAD by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xE2) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psrad", Iop_SarN32x4 ); + goto decode_success; + } + + /* 66 0F 71 /4 ib = PSRAW by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x71 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 4) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psraw", Iop_SarN16x8 ); + goto decode_success; + } + + /* 66 0F E1 = PSRAW by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xE1) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psraw", Iop_SarN16x8 ); + goto decode_success; + } + + /* 66 0F 72 /2 ib = PSRLD by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x72 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 2) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrld", Iop_ShrN32x4 ); + goto decode_success; + } + + /* 66 0F D2 = PSRLD by E */ + if (have66noF2noF3(pfx) 
&& sz == 2 + && insn[0] == 0x0F && insn[1] == 0xD2) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psrld", Iop_ShrN32x4 ); + goto decode_success; + } /* 66 0F 73 /3 ib = PSRLDQ by immediate */ /* note, if mem case ever filled in, 1 byte after amode */ @@ -11260,25 +11284,28 @@ DisResult disInstr ( /*IN*/ Bool resteerOK, goto decode_success; } -//.. /* 66 0F D3 = PSRLQ by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F 71 /2 ib = PSRLW by immediate */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71 -//.. && epartIsReg(insn[2]) -//.. && gregOfRM(insn[2]) == 2) { -//.. delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 ); -//.. goto decode_success; -//.. } -//.. -//.. /* 66 0F D1 = PSRLW by E */ -//.. if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) { -//.. delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 ); -//.. goto decode_success; -//.. } + /* 66 0F D3 = PSRLQ by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xD3) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psrlq", Iop_ShrN64x2 ); + goto decode_success; + } + + /* 66 0F 71 /2 ib = PSRLW by immediate */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x71 + && epartIsReg(insn[2]) + && gregLO3ofRM(insn[2]) == 2) { + delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlw", Iop_ShrN16x8 ); + goto decode_success; + } + + /* 66 0F D1 = PSRLW by E */ + if (have66noF2noF3(pfx) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0xD1) { + delta = dis_SSE_shiftG_byE( pfx, delta+2, "psrlw", Iop_ShrN16x8 ); + goto decode_success; + } /* 66 0F F8 = PSUBB */ if (have66noF2noF3(pfx) && sz == 2 diff --git a/VEX/priv/host-amd64/hdefs.c b/VEX/priv/host-amd64/hdefs.c index 1505efa510..697f51eb8c 100644 --- a/VEX/priv/host-amd64/hdefs.c +++ b/VEX/priv/host-amd64/hdefs.c @@ -619,29 +619,29 @@ HChar* showAMD64SseOp ( AMD64SseOp op ) { case Asse_QSUB16U: return "psubusw"; case Asse_QSUB8S: return "psubsb"; case Asse_QSUB16S: return "psubsw"; -//.. case Xsse_MUL16: return "pmullw"; -//.. case Xsse_MULHI16U: return "pmulhuw"; -//.. case Xsse_MULHI16S: return "pmulhw"; + case Asse_MUL16: return "pmullw"; + case Asse_MULHI16U: return "pmulhuw"; + case Asse_MULHI16S: return "pmulhw"; //.. case Xsse_AVG8U: return "pavgb"; //.. case Xsse_AVG16U: return "pavgw"; -//.. case Xsse_MAX16S: return "pmaxw"; -//.. case Xsse_MAX8U: return "pmaxub"; -//.. case Xsse_MIN16S: return "pminw"; -//.. case Xsse_MIN8U: return "pminub"; + case Asse_MAX16S: return "pmaxw"; + case Asse_MAX8U: return "pmaxub"; + case Asse_MIN16S: return "pminw"; + case Asse_MIN8U: return "pminub"; //.. case Xsse_CMPEQ8: return "pcmpeqb"; //.. case Xsse_CMPEQ16: return "pcmpeqw"; case Asse_CMPEQ32: return "pcmpeqd"; //.. case Xsse_CMPGT8S: return "pcmpgtb"; //.. case Xsse_CMPGT16S: return "pcmpgtw"; //.. case Xsse_CMPGT32S: return "pcmpgtd"; -//.. case Xsse_SHL16: return "psllw"; -//.. case Xsse_SHL32: return "pslld"; -//.. case Xsse_SHL64: return "psllq"; -//.. case Xsse_SHR16: return "psrlw"; -//.. case Xsse_SHR32: return "psrld"; + case Asse_SHL16: return "psllw"; + case Asse_SHL32: return "pslld"; + case Asse_SHL64: return "psllq"; + case Asse_SHR16: return "psrlw"; + case Asse_SHR32: return "psrld"; case Asse_SHR64: return "psrlq"; -//.. case Xsse_SAR16: return "psraw"; -//.. 
case Xsse_SAR32: return "psrad"; + case Asse_SAR16: return "psraw"; + case Asse_SAR32: return "psrad"; case Asse_PACKSSD: return "packssdw"; case Asse_PACKSSW: return "packsswb"; case Asse_PACKUSW: return "packuswb"; @@ -3329,20 +3329,20 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i ) //.. case Xsse_CMPGT8S: XX(0x66); XX(rex); XX(0x0F); XX(0x64); break; //.. case Xsse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break; //.. case Xsse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break; -//.. case Xsse_MAX16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break; -//.. case Xsse_MAX8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break; -//.. case Xsse_MIN16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break; -//.. case Xsse_MIN8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break; -//.. case Xsse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break; -//.. case Xsse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break; -//.. case Xsse_MUL16: XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break; -//.. case Xsse_SHL16: XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break; -//.. case Xsse_SHL32: XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break; -//.. case Xsse_SHL64: XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break; -//.. case Xsse_SAR16: XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break; -//.. case Xsse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break; -//.. case Xsse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break; -//.. case Xsse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break; + case Asse_MAX16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break; + case Asse_MAX8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break; + case Asse_MIN16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break; + case Asse_MIN8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break; + case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break; + case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break; + case Asse_MUL16: XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break; + case Asse_SHL16: XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break; + case Asse_SHL32: XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break; + case Asse_SHL64: XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break; + case Asse_SAR16: XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break; + case Asse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break; + case Asse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break; + case Asse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break; case Asse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break; case Asse_SUB8: XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break; case Asse_SUB16: XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break; diff --git a/VEX/priv/host-amd64/hdefs.h b/VEX/priv/host-amd64/hdefs.h index ac3bc6f90b..0455063618 100644 --- a/VEX/priv/host-amd64/hdefs.h +++ b/VEX/priv/host-amd64/hdefs.h @@ -329,21 +329,20 @@ typedef Asse_SUB8, Asse_SUB16, Asse_SUB32, Asse_SUB64, Asse_QSUB8U, Asse_QSUB16U, Asse_QSUB8S, Asse_QSUB16S, -//.. Xsse_MUL16, -//.. Xsse_MULHI16U, -//.. Xsse_MULHI16S, + Asse_MUL16, + Asse_MULHI16U, + Asse_MULHI16S, //.. Xsse_AVG8U, Xsse_AVG16U, -//.. Xsse_MAX16S, -//.. Xsse_MAX8U, -//.. Xsse_MIN16S, -//.. Xsse_MIN8U, + Asse_MAX16S, + Asse_MAX8U, + Asse_MIN16S, + Asse_MIN8U, //.. Xsse_CMPEQ8, Xsse_CMPEQ16, Asse_CMPEQ32, //.. Xsse_CMPGT8S, Xsse_CMPGT16S, Xsse_CMPGT32S, -//.. Xsse_SHL16, Xsse_SHL32, Xsse_SHL64, -//.. Xsse_SHR16, Xsse_SHR32, - Asse_SHR64, -//.. 
Xsse_SAR16, Xsse_SAR32, + Asse_SHL16, Asse_SHL32, Asse_SHL64, + Asse_SHR16, Asse_SHR32, Asse_SHR64, + Asse_SAR16, Asse_SAR32, Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW, Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ, Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index 72e74a345e..1c82befe6f 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -3415,13 +3415,13 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) //.. case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S; goto do_SseReRg; //.. case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg; //.. case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg; -//.. case Iop_Max16Sx8: op = Xsse_MAX16S; goto do_SseReRg; -//.. case Iop_Max8Ux16: op = Xsse_MAX8U; goto do_SseReRg; -//.. case Iop_Min16Sx8: op = Xsse_MIN16S; goto do_SseReRg; -//.. case Iop_Min8Ux16: op = Xsse_MIN8U; goto do_SseReRg; -//.. case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg; -//.. case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg; -//.. case Iop_Mul16x8: op = Xsse_MUL16; goto do_SseReRg; + case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg; + case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg; + case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg; + case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg; + case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg; + case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg; + case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg; case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg; case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg; case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg; @@ -3444,13 +3444,13 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } -//.. case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift; -//.. case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift; -//.. case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift; -//.. case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift; -//.. case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift; -//.. case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift; -//.. case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift; + case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift; + case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift; + case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift; + case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift; + case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift; + case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift; + case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift; case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift; do_SseShift: { HReg greg = iselVecExpr(env, e->Iex.Binop.arg1); @@ -3472,17 +3472,17 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ -//.. if (e->tag == Iex_Mux0X) { -//.. HReg r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond); -//.. HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX); -//.. HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0); -//.. HReg dst = newVRegV(env); -//.. addInstr(env, mk_vMOVsd_RR(rX,dst)); -//.. addInstr(env, X86Instr_Test32(X86RI_Imm(0xFF), X86RM_Reg(r8))); -//.. addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst)); -//.. return dst; -//.. } -//.. 
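/* For reference -- a minimal C model of the Iex_Mux0X selection that the
   lowering just below implements: mk_vMOVsd_RR copies rX into dst,
   AMD64Instr_Test64(Imm(0xFF), r8) sets Z iff the low byte of the
   condition is zero, and SseCMov(Acc_Z, r0, dst) then replaces dst
   with r0 in that case.  V128R and mux0x_ref are hypothetical
   stand-ins for illustration, not VEX names. */

#include <stdint.h>

typedef struct { uint64_t hi, lo; } V128R;  /* stand-in for an XMM value */

static inline V128R mux0x_ref ( uint8_t cond8, V128R expr0, V128R exprX )
{
   /* Mux0X: choose expr0 when the condition byte is zero, else exprX. */
   return cond8 == 0 ? expr0 : exprX;
}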
+ if (e->tag == Iex_Mux0X) { + HReg r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond); + HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX); + HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0); + HReg dst = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(rX,dst)); + addInstr(env, AMD64Instr_Test64(AMD64RI_Imm(0xFF), AMD64RM_Reg(r8))); + addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst)); + return dst; + } + vec_fail: vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexSubArch(env->subarch)); -- 2.47.3
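For readers following the decoder changes above, here are minimal C
reference models of what the main new translations compute.  They are
sketches only: the type and function names (V128I, sse_pmovmskb_ref and
so on) are hypothetical stand-ins that do not appear in VEX, and the
PSRAW model assumes the compiler's right shift of signed values is
arithmetic, as it is with gcc on amd64.

#include <stdint.h>

/* Hypothetical 128-bit value, viewed as 16 bytes / 4 words / 2 qwords. */
typedef union {
   uint8_t  b[16];
   uint32_t w32[4];
   uint64_t w64[2];
} V128I;

/* 66 0F D7 PMOVMSKB, as composed by amd64g_calculate_sse_pmovmskb:
   collect the sign bit of each of the 16 bytes into a 16-bit mask,
   zero-extended to 64 bits (low half -> bits 0..7, high half -> 8..15).
   The helper in the patch gets each 8-bit half by calling the existing
   MMX helper once per 64-bit lane; this sketch inlines that loop. */
static uint64_t sse_pmovmskb_ref ( uint64_t w64hi, uint64_t w64lo )
{
   uint64_t mask = 0;
   for (int i = 0; i < 8; i++) {
      mask |= ((w64lo >> (8*i + 7)) & 1) << i;
      mask |= ((w64hi >> (8*i + 7)) & 1) << (i + 8);
   }
   return mask;
}

/* 66 0F 70 PSHUFD: each 2-bit field of the immediate selects one 32-bit
   source lane, which is what the SEL macro expresses in the IR. */
static V128I pshufd_ref ( V128I src, uint8_t order )
{
   V128I dst;
   for (int i = 0; i < 4; i++)
      dst.w32[i] = src.w32[(order >> (2*i)) & 3];
   return dst;
}

/* 66 0F F4 PMULUDQ: unsigned widening multiply of lanes 0 and 2 of each
   operand, forming the low and high 64-bit halves of the result, as in
   the two Iop_MullU32 / putXMMRegLane64 pairs above. */
static V128I pmuludq_ref ( V128I d, V128I s )
{
   V128I r;
   r.w64[0] = (uint64_t)d.w32[0] * (uint64_t)s.w32[0];
   r.w64[1] = (uint64_t)d.w32[2] * (uint64_t)s.w32[2];
   return r;
}

/* The Mux0X guard in dis_SSE_shiftG_byE, shown for one 16-bit lane:
   a logical shift by an out-of-range amount (>= lane width) gives zero,
   while PSRAW clamps the amount to 15 so every bit of the lane becomes
   a copy of its sign bit. */
static uint16_t psrlw_lane_ref ( uint16_t lane, uint64_t amt )
{
   return amt >= 16 ? 0 : (uint16_t)(lane >> amt);
}
static int16_t psraw_lane_ref ( int16_t lane, uint64_t amt )
{
   return (int16_t)(lane >> (amt >= 16 ? 15 : amt));
}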