From: Julian Seward Date: Tue, 13 May 2008 08:38:43 +0000 (+0000) Subject: Merge r1808,1809,1810: SSSE3 x86/amd64 support, and update CPUID accordingly. X-Git-Tag: svn/VALGRIND_3_3_1^2~9 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b88586501c670fa4b7db2f2cda7a03b7e3155597;p=thirdparty%2Fvalgrind.git Merge r1808,1809,1810: SSSE3 x86/amd64 support, and update CPUID accordingly. (Fixes #155528, although only tangentially) git-svn-id: svn://svn.valgrind.org/vex/branches/VEX_3_3_BRANCH@1841 --- diff --git a/VEX/priv/guest-amd64/ghelpers.c b/VEX/priv/guest-amd64/ghelpers.c index b6aa3289dd..e76ed0defb 100644 --- a/VEX/priv/guest-amd64/ghelpers.c +++ b/VEX/priv/guest-amd64/ghelpers.c @@ -1758,27 +1758,32 @@ void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state, /*--- Misc integer helpers, including rotates and CPUID. ---*/ /*---------------------------------------------------------------*/ -/* Claim to be the following CPU: - vendor_id : AuthenticAMD - cpu family : 15 - model : 12 - model name : AMD Athlon(tm) 64 Processor 3200+ - stepping : 0 - cpu MHz : 2202.917 - cache size : 512 KB +/* Claim to be the following CPU (2 x ...): + vendor_id : GenuineIntel + cpu family : 6 + model : 15 + model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz + stepping : 6 + cpu MHz : 2394.000 + cache size : 4096 KB + physical id : 0 + siblings : 2 + core id : 0 + cpu cores : 2 fpu : yes fpu_exception : yes - cpuid level : 1 + cpuid level : 10 wp : yes - flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr - pge mca cmov pat pse36 clflush mmx fxsr sse sse2 - pni syscall nx mmxext lm 3dnowext 3dnow - bogomips : 4308.99 - TLB size : 1088 4K pages + flags : fpu vme de pse tsc msr pae mce cx8 apic sep + mtrr pge mca cmov pat pse36 clflush dts acpi + mmx fxsr sse sse2 ss ht tm syscall nx lm + constant_tsc pni monitor ds_cpl vmx est tm2 + cx16 xtpr lahf_lm + bogomips : 4798.78 clflush size : 64 cache_alignment : 64 - address sizes : 40 bits physical, 48 bits virtual - power management: ts fid vid ttp + address sizes : 36 bits physical, 48 bits virtual + power management: */ void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* st ) { @@ -1790,42 +1795,79 @@ void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* st ) } while (0) switch (0xFFFFFFFF & st->guest_RAX) { - case 0x0: - SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65); + case 0x00000000: + SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69); break; - case 0x1: - SET_ABCD(0x00000fc0, 0x00000800, 0x00000000, 0x078bfbff); + case 0x00000001: + SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff); break; - case 0x80000000: - SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65); + case 0x00000002: + SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049); break; - case 0x80000001: - SET_ABCD(0x00000fc0, 0x0000010a, 0x00000000, 0xe1d3fbff); + case 0x00000003: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); break; - case 0x80000002: - SET_ABCD(0x20444d41, 0x6c687441, 0x74286e6f, 0x3620296d); + case 0x00000004: { + switch (0xFFFFFFFF & st->guest_RCX) { + case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f, + 0x0000003f, 0x00000001); break; + case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f, + 0x0000003f, 0x00000001); break; + case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f, + 0x00000fff, 0x00000001); break; + default: SET_ABCD(0x00000000, 0x00000000, + 0x00000000, 0x00000000); break; + } break; - case 0x80000003: - SET_ABCD(0x72502034, 0x7365636f, 0x20726f73, 0x30303233); + } + case 0x00000005: + 
SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020); break; - case 0x80000004: - SET_ABCD(0x0000002b, 0x00000000, 0x00000000, 0x00000000); + case 0x00000006: + SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000); break; - case 0x80000005: - SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140); + case 0x00000007: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); break; - case 0x80000006: - SET_ABCD(0x00000000, 0x42004200, 0x02008140, 0x00000000); + case 0x00000008: + SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000); break; - case 0x80000007: - SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f); + case 0x00000009: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); break; - case 0x80000008: - SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000); + case 0x0000000a: + unhandled_eax_value: + SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000); break; - default: - SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + case 0x80000000: + SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000001: + SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800); + break; + case 0x80000002: + SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); + break; + case 0x80000003: + SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020); break; + case 0x80000004: + SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847); + break; + case 0x80000005: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000006: + SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000); + break; + case 0x80000007: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000008: + SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); + break; + default: + goto unhandled_eax_value; } # undef SET_ABCD } diff --git a/VEX/priv/guest-amd64/toIR.c b/VEX/priv/guest-amd64/toIR.c index 0003976228..28083742ae 100644 --- a/VEX/priv/guest-amd64/toIR.c +++ b/VEX/priv/guest-amd64/toIR.c @@ -8309,6 +8309,182 @@ static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2, } +/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. 
Given two 64-bit + values (aa,bb), computes, for each of the 4 16-bit lanes: + + (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1 +*/ +static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp aahi32s = newTemp(Ity_I64); + IRTemp aalo32s = newTemp(Ity_I64); + IRTemp bbhi32s = newTemp(Ity_I64); + IRTemp bblo32s = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp one32x2 = newTemp(Ity_I64); + assign(aa, aax); + assign(bb, bbx); + assign( aahi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( aalo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( bbhi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign( bblo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign(one32x2, mkU64( (1ULL << 32) + 1 )); + assign( + rHi, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + assign( + rLo, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + return + binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo)); +} + +/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit + values (aa,bb), computes, for each lane: + + if aa_lane < 0 then - bb_lane + else if aa_lane > 0 then bb_lane + else 0 +*/ +static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp bbNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opCmpGTS = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break; + case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break; + case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( bb, bbx ); + assign( zero, mkU64(0) ); + assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) ); + assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) ); + assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) ); + + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(bb), mkexpr(posMask)), + binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) ); + +} + +/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit + value aa, computes, for each lane + + if aa < 0 then -aa else aa + + Note that the result is interpreted as unsigned, so that the + absolute value of the most negative signed input can be + represented. 
+*/ +static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp aaNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opSarN = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break; + case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break; + case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) ); + assign( posMask, unop(Iop_Not64, mkexpr(negMask)) ); + assign( zero, mkU64(0) ); + assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) ); + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(aa), mkexpr(posMask)), + binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ); +} + +static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, + IRTemp lo64, Long byteShift ) +{ + vassert(byteShift >= 1 && byteShift <= 7); + return + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))), + binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift)) + ); +} + +/* Generate a SIGSEGV followed by a restart of the current instruction + if effective_addr is not 16-aligned. This is required behaviour + for some SSE3 instructions and all 128-bit SSSE3 instructions. + This assumes that guest_RIP_curr_instr is set correctly! */ +static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) +{ + stmt( + IRStmt_Exit( + binop(Iop_CmpNE64, + binop(Iop_And64,mkexpr(effective_addr),mkU64(0xF)), + mkU64(0)), + Ijk_SigSEGV, + IRConst_U64(guest_RIP_curr_instr) + ) + ); +} + + /* Helper for deciding whether a given insn (starting at the opcode byte) may validly be used with a LOCK prefix. The following insns may be used with LOCK when their destination operand is in memory. @@ -12455,6 +12631,830 @@ DisResult disInstr_AMD64_WRK ( /* --- end of the SSE3 decoder. --- */ /* ---------------------------------------------------- */ + /* ---------------------------------------------------- */ + /* --- start of the SSSE3 decoder. 
--- */ + /* ---------------------------------------------------- */ + + /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp sVoddsSX = newTemp(Ity_I64); + IRTemp sVevensSX = newTemp(Ity_I64); + IRTemp dVoddsZX = newTemp(Ity_I64); + IRTemp dVevensZX = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x4, + binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x4, + binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putMMXReg( + gregLO3ofRM(modrm), + binop(Iop_QAdd16Sx4, + binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sVoddsSX = newTemp(Ity_V128); + IRTemp sVevensSX = newTemp(Ity_V128); + IRTemp dVoddsZX = newTemp(Ity_V128); + IRTemp dVevensZX = newTemp(Ity_V128); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x8, + binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x8, + binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_QAdd16Sx8, + binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* ***--- these are MMX class insns introduced in SSSE3 ---*** */ + /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G + to G (mmx). 
*/ + /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G + to G (mmx). */ + + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("ph%s %s,%s\n", str, dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop(opV64, + binop(opCatE,mkexpr(sV),mkexpr(dV)), + binop(opCatO,mkexpr(sV),mkexpr(dV)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and + G to G (xmm). 
*/ + + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) ); + DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + delta += 3+1; + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + DIP("ph%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + delta += 3+alen; + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + /* This isn't a particularly efficient way to compute the + result, but at least it avoids a proliferation of IROps, + hence avoids complication all the backends. 
*/ + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(sHi),mkexpr(sLo)), + binop(opCatO,mkexpr(sHi),mkexpr(sLo)) + ), + binop(opV64, + binop(opCatE,mkexpr(dHi),mkexpr(dLo)), + binop(opCatO,mkexpr(dHi),mkexpr(dLo)) + ) + ) + ); + goto decode_success; + } + + /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale + (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmulhrsw %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) ) + ); + goto decode_success; + } + + /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and + Scale (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmulhrsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ), + dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) ) + ) + ); + goto decode_success; + } + + /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */ + /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */ + /* 0F 38 09 = PSIGND -- Packed Sign 32x2 (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x08: laneszB = 1; str = "b"; break; + case 0x09: laneszB = 2; str = "w"; break; + case 0x0A: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("psign%s %s,%s\n", str, dis_buf, + 
nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB ) + ); + goto decode_success; + } + + /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */ + /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */ + /* 66 0F 38 09 = PSIGND -- Packed Sign 32x4 (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x08: laneszB = 1; str = "b"; break; + case 0x09: laneszB = 2; str = "w"; break; + case 0x0A: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("psign%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ), + dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB ) + ) + ); + goto decode_success; + } + + /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */ + /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */ + /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) { + IRTemp sV = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x1C: laneszB = 1; str = "b"; break; + case 0x1D: laneszB = 2; str = "w"; break; + case 0x1E: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + do_MMX_preamble(); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pabs%s %s,%s\n", str, dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + dis_PABS_helper( mkexpr(sV), laneszB ) + ); + goto decode_success; + } + + /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */ + /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */ + /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) { + IRTemp sV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x1C: laneszB = 
1; str = "b"; break; + case 0x1D: laneszB = 2; str = "w"; break; + case 0x1E: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pabs%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, + dis_PABS_helper( mkexpr(sHi), laneszB ), + dis_PABS_helper( mkexpr(sLo), laneszB ) + ) + ); + goto decode_success; + } + + /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */ + if (haveNo66noF2noF3(pfx) && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + d64 = (Long)insn[3+1]; + delta += 3+1+1; + DIP("palignr $%d,%s,%s\n", (Int)d64, + nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + d64 = (Long)insn[3+alen]; + delta += 3+alen+1; + DIP("palignr $%d%s,%s\n", (Int)d64, + dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + if (d64 == 0) { + assign( res, mkexpr(sV) ); + } + else if (d64 >= 1 && d64 <= 7) { + assign(res, + binop(Iop_Or64, + binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)), + binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64)) + ))); + } + else if (d64 == 8) { + assign( res, mkexpr(dV) ); + } + else if (d64 >= 9 && d64 <= 15) { + assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) ); + } + else if (d64 >= 16 && d64 <= 255) { + assign( res, mkU64(0) ); + } + else + vassert(0); + + putMMXReg( gregLO3ofRM(modrm), mkexpr(res) ); + goto decode_success; + } + + /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + d64 = (Long)insn[3+1]; + delta += 3+1+1; + DIP("palignr $%d,%s,%s\n", (Int)d64, + nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + d64 = (Long)insn[3+alen]; + delta += 3+alen+1; + DIP("palignr $%d,%s,%s\n", (Int)d64, + dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + if (d64 == 0) { + 
assign( rHi, mkexpr(sHi) ); + assign( rLo, mkexpr(sLo) ); + } + else if (d64 >= 1 && d64 <= 7) { + assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) ); + assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) ); + } + else if (d64 == 8) { + assign( rHi, mkexpr(dLo) ); + assign( rLo, mkexpr(sHi) ); + } + else if (d64 >= 9 && d64 <= 15) { + assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) ); + assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) ); + } + else if (d64 == 16) { + assign( rHi, mkexpr(dHi) ); + assign( rLo, mkexpr(dLo) ); + } + else if (d64 >= 17 && d64 <= 23) { + assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) ); + assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) ); + } + else if (d64 == 24) { + assign( rHi, mkU64(0) ); + assign( rLo, mkexpr(dHi) ); + } + else if (d64 >= 25 && d64 <= 31) { + assign( rHi, mkU64(0) ); + assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) ); + } + else if (d64 >= 32 && d64 <= 255) { + assign( rHi, mkU64(0) ); + assign( rLo, mkU64(0) ); + } + else + vassert(0); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop( + Iop_And64, + /* permute the lanes */ + binop( + Iop_Perm8x8, + mkexpr(dV), + binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL)) + ), + /* mask off lanes which have (index & 0x80) == 0x80 */ + unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7))) + ) + ); + goto decode_success; + } + + /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp sevens = newTemp(Ity_I64); + IRTemp mask0x80hi = newTemp(Ity_I64); + IRTemp mask0x80lo = newTemp(Ity_I64); + IRTemp maskBit3hi = newTemp(Ity_I64); + IRTemp maskBit3lo = newTemp(Ity_I64); + IRTemp sAnd7hi = newTemp(Ity_I64); + IRTemp sAnd7lo = newTemp(Ity_I64); + IRTemp permdHi = newTemp(Ity_I64); + IRTemp permdLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, 
unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + assign( sevens, mkU64(0x0707070707070707ULL) ); + + /* + mask0x80hi = Not(SarN8x8(sHi,7)) + maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) + sAnd7hi = And(sHi,sevens) + permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), + And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) + rHi = And(permdHi,mask0x80hi) + */ + assign( + mask0x80hi, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); + + assign( + maskBit3hi, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), + mkU8(7))); + + assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); + + assign( + permdHi, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), + mkexpr(maskBit3hi)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), + unop(Iop_Not64,mkexpr(maskBit3hi))) )); + + assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); + + /* And the same for the lower half of the result. What fun. */ + + assign( + mask0x80lo, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); + + assign( + maskBit3lo, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), + mkU8(7))); + + assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); + + assign( + permdLo, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), + mkexpr(maskBit3lo)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), + unop(Iop_Not64,mkexpr(maskBit3lo))) )); + + assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* ---------------------------------------------------- */ + /* --- end of the SSSE3 decoder. --- */ + /* ---------------------------------------------------- */ + /*after_sse_decoders:*/ /* Get the primary opcode. */ @@ -14337,7 +15337,7 @@ DisResult disInstr_AMD64_WRK ( d->fxState[1].fx = Ifx_Write; d->fxState[1].offset = OFFB_RBX; d->fxState[1].size = 8; - d->fxState[2].fx = Ifx_Write; + d->fxState[2].fx = Ifx_Modify; d->fxState[2].offset = OFFB_RCX; d->fxState[2].size = 8; d->fxState[3].fx = Ifx_Write; @@ -14699,11 +15699,13 @@ DisResult disInstr_AMD64_WRK ( decode_failure: /* All decode failures end up here. */ vex_printf("vex amd64->IR: unhandled instruction bytes: " - "0x%x 0x%x 0x%x 0x%x\n", + "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", (Int)getUChar(delta_start+0), (Int)getUChar(delta_start+1), (Int)getUChar(delta_start+2), - (Int)getUChar(delta_start+3) ); + (Int)getUChar(delta_start+3), + (Int)getUChar(delta_start+4), + (Int)getUChar(delta_start+5) ); /* Tell the dispatcher that this insn cannot be decoded, and so has not been executed, and (is currently) the next to be executed. 
diff --git a/VEX/priv/guest-x86/ghelpers.c b/VEX/priv/guest-x86/ghelpers.c index c89357603c..2c900087ac 100644 --- a/VEX/priv/guest-x86/ghelpers.c +++ b/VEX/priv/guest-x86/ghelpers.c @@ -2075,37 +2075,118 @@ void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* st ) } } -/* Claim to be the following SSE2-capable CPU: +/* Claim to be the following SSSE3-capable CPU (2 x ...): vendor_id : GenuineIntel - cpu family : 15 - model : 2 - model name : Intel(R) Pentium(R) 4 CPU 2.40GHz - stepping : 7 - cpu MHz : 2394.234 - cache size : 512 KB + cpu family : 6 + model : 15 + model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz + stepping : 6 + cpu MHz : 2394.000 + cache size : 4096 KB + physical id : 0 + siblings : 2 + core id : 0 + cpu cores : 2 + fpu : yes + fpu_exception : yes + cpuid level : 10 + wp : yes + flags : fpu vme de pse tsc msr pae mce cx8 apic sep + mtrr pge mca cmov pat pse36 clflush dts acpi + mmx fxsr sse sse2 ss ht tm syscall nx lm + constant_tsc pni monitor ds_cpl vmx est tm2 + cx16 xtpr lahf_lm + bogomips : 4798.78 + clflush size : 64 + cache_alignment : 64 + address sizes : 36 bits physical, 48 bits virtual + power management: */ void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* st ) { +# define SET_ABCD(_a,_b,_c,_d) \ + do { st->guest_EAX = (UInt)(_a); \ + st->guest_EBX = (UInt)(_b); \ + st->guest_ECX = (UInt)(_c); \ + st->guest_EDX = (UInt)(_d); \ + } while (0) + switch (st->guest_EAX) { - case 0: - st->guest_EAX = 0x00000002; - st->guest_EBX = 0x756e6547; - st->guest_ECX = 0x6c65746e; - st->guest_EDX = 0x49656e69; + case 0x00000000: + SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69); break; - case 1: - st->guest_EAX = 0x00000f27; - st->guest_EBX = 0x00010809; - st->guest_ECX = 0x00004400; - st->guest_EDX = 0xbfebfbff; + case 0x00000001: + SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff); break; - default: - st->guest_EAX = 0x665b5101; - st->guest_EBX = 0x00000000; - st->guest_ECX = 0x00000000; - st->guest_EDX = 0x007b7040; + case 0x00000002: + SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049); + break; + case 0x00000003: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x00000004: { + switch (st->guest_ECX) { + case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f, + 0x0000003f, 0x00000001); break; + case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f, + 0x0000003f, 0x00000001); break; + case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f, + 0x00000fff, 0x00000001); break; + default: SET_ABCD(0x00000000, 0x00000000, + 0x00000000, 0x00000000); break; + } + break; + } + case 0x00000005: + SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020); + break; + case 0x00000006: + SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000); + break; + case 0x00000007: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x00000008: + SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x00000009: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x0000000a: + unhandled_eax_value: + SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000000: + SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000001: + SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100000); break; + case 0x80000002: + SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865); + break; + case 0x80000003: + SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020); + break; + case 0x80000004: + SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847); + break; 
+ case 0x80000005: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000006: + SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000); + break; + case 0x80000007: + SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000); + break; + case 0x80000008: + SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000); + break; + default: + goto unhandled_eax_value; } +# undef SET_ABCD } diff --git a/VEX/priv/guest-x86/toIR.c b/VEX/priv/guest-x86/toIR.c index 304a85cef5..343a4f6c03 100644 --- a/VEX/priv/guest-x86/toIR.c +++ b/VEX/priv/guest-x86/toIR.c @@ -7225,6 +7225,182 @@ void set_EFLAGS_from_value ( IRTemp t1, } +/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit + values (aa,bb), computes, for each of the 4 16-bit lanes: + + (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1 +*/ +static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp aahi32s = newTemp(Ity_I64); + IRTemp aalo32s = newTemp(Ity_I64); + IRTemp bbhi32s = newTemp(Ity_I64); + IRTemp bblo32s = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp one32x2 = newTemp(Ity_I64); + assign(aa, aax); + assign(bb, bbx); + assign( aahi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( aalo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( bbhi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign( bblo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign(one32x2, mkU64( (1ULL << 32) + 1 )); + assign( + rHi, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + assign( + rLo, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + return + binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo)); +} + +/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit + values (aa,bb), computes, for each lane: + + if aa_lane < 0 then - bb_lane + else if aa_lane > 0 then bb_lane + else 0 +*/ +static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp bbNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opCmpGTS = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break; + case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break; + case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( bb, bbx ); + assign( zero, mkU64(0) ); + assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) ); + assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) ); + assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) ); + + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(bb), mkexpr(posMask)), + binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) ); + +} + +/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. 
Given a 64-bit + value aa, computes, for each lane + + if aa < 0 then -aa else aa + + Note that the result is interpreted as unsigned, so that the + absolute value of the most negative signed input can be + represented. +*/ +static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp aaNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opSarN = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break; + case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break; + case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) ); + assign( posMask, unop(Iop_Not64, mkexpr(negMask)) ); + assign( zero, mkU64(0) ); + assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) ); + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(aa), mkexpr(posMask)), + binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ); +} + +static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, + IRTemp lo64, Int byteShift ) +{ + vassert(byteShift >= 1 && byteShift <= 7); + return + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))), + binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift)) + ); +} + +/* Generate a SIGSEGV followed by a restart of the current instruction + if effective_addr is not 16-aligned. This is required behaviour + for some SSE3 instructions and all 128-bit SSSE3 instructions. + This assumes that guest_RIP_curr_instr is set correctly! */ +static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) +{ + stmt( + IRStmt_Exit( + binop(Iop_CmpNE32, + binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)), + mkU32(0)), + Ijk_SigSEGV, + IRConst_U32(guest_EIP_curr_instr) + ) + ); +} + + /* Helper for deciding whether a given insn (starting at the opcode byte) may validly be used with a LOCK prefix. The following insns may be used with LOCK when their destination operand is in memory. @@ -11167,6 +11343,817 @@ DisResult disInstr_X86_WRK ( /* --- end of the SSE3 decoder. --- */ /* ---------------------------------------------------- */ + /* ---------------------------------------------------- */ + /* --- start of the SSSE3 decoder. 
--- */ + /* ---------------------------------------------------- */ + + /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp sVoddsSX = newTemp(Ity_I64); + IRTemp sVevensSX = newTemp(Ity_I64); + IRTemp dVoddsZX = newTemp(Ity_I64); + IRTemp dVevensZX = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x4, + binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x4, + binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putMMXReg( + gregOfRM(modrm), + binop(Iop_QAdd16Sx4, + binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sVoddsSX = newTemp(Ity_V128); + IRTemp sVevensSX = newTemp(Ity_V128); + IRTemp dVoddsZX = newTemp(Ity_V128); + IRTemp dVevensZX = newTemp(Ity_V128); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x8, + binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x8, + binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_QAdd16Sx8, + binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* ***--- these are MMX class insns introduced in SSSE3 ---*** */ + /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G + to G (mmx). 
*/ + /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G + to G (mmx). */ + + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("ph%s %s,%s\n", str, dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + putMMXReg( + gregOfRM(modrm), + binop(opV64, + binop(opCatE,mkexpr(sV),mkexpr(dV)), + binop(opCatO,mkexpr(sV),mkexpr(dV)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and + G to G (xmm). 
*/ + + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg( eregOfRM(modrm)) ); + DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + delta += 3+1; + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + DIP("ph%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRM(modrm))); + delta += 3+alen; + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + /* This isn't a particularly efficient way to compute the + result, but at least it avoids a proliferation of IROps, + hence avoids complication all the backends. 
*/ + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(sHi),mkexpr(sLo)), + binop(opCatO,mkexpr(sHi),mkexpr(sLo)) + ), + binop(opV64, + binop(opCatE,mkexpr(dHi),mkexpr(dLo)), + binop(opCatO,mkexpr(dHi),mkexpr(dLo)) + ) + ) + ); + goto decode_success; + } + + /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale + (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmulhrsw %s,%s\n", dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + putMMXReg( + gregOfRM(modrm), + dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) ) + ); + goto decode_success; + } + + /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and + Scale (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmulhrsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ), + dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) ) + ) + ); + goto decode_success; + } + + /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */ + /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */ + /* 0F 38 09 = PSIGND -- Packed Sign 32x2 (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x08: laneszB = 1; str = "b"; break; + case 0x09: laneszB = 2; str = "w"; break; + case 0x0A: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("psign%s %s,%s\n", str, dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + putMMXReg( + gregOfRM(modrm), + dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB ) + ); + goto decode_success; + } + + /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */ + /* 66 0F 
38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */ + /* 66 0F 38 09 = PSIGND -- Packed Sign 32x4 (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x08: laneszB = 1; str = "b"; break; + case 0x09: laneszB = 2; str = "w"; break; + case 0x0A: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("psign%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ), + dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB ) + ) + ); + goto decode_success; + } + + /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */ + /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */ + /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) { + IRTemp sV = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x1C: laneszB = 1; str = "b"; break; + case 0x1D: laneszB = 2; str = "w"; break; + case 0x1E: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + do_MMX_preamble(); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pabs%s %s,%s\n", str, dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + putMMXReg( + gregOfRM(modrm), + dis_PABS_helper( mkexpr(sV), laneszB ) + ); + goto decode_success; + } + + /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */ + /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */ + /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) { + IRTemp sV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + HChar* str = "???"; + Int laneszB = 0; + + switch (insn[2]) { + case 0x1C: laneszB = 1; str = "b"; break; + case 0x1D: laneszB = 2; str = "w"; break; + case 0x1E: laneszB = 4; str = "d"; break; + default: vassert(0); + } + + modrm = insn[3]; + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( 
addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pabs%s %s,%s\n", str, dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, + dis_PABS_helper( mkexpr(sHi), laneszB ), + dis_PABS_helper( mkexpr(sLo), laneszB ) + ) + ); + goto decode_success; + } + + /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp res = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + d32 = (UInt)insn[3+1]; + delta += 3+1+1; + DIP("palignr $%d,%s,%s\n", (Int)d32, + nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + d32 = (UInt)insn[3+alen]; + delta += 3+alen+1; + DIP("palignr $%d%s,%s\n", (Int)d32, + dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + if (d32 == 0) { + assign( res, mkexpr(sV) ); + } + else if (d32 >= 1 && d32 <= 7) { + assign(res, + binop(Iop_Or64, + binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)), + binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32)) + ))); + } + else if (d32 == 8) { + assign( res, mkexpr(dV) ); + } + else if (d32 >= 9 && d32 <= 15) { + assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) ); + } + else if (d32 >= 16 && d32 <= 255) { + assign( res, mkU64(0) ); + } + else + vassert(0); + + putMMXReg( gregOfRM(modrm), mkexpr(res) ); + goto decode_success; + } + + /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + d32 = (UInt)insn[3+1]; + delta += 3+1+1; + DIP("palignr $%d,%s,%s\n", (Int)d32, + nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + d32 = (UInt)insn[3+alen]; + delta += 3+alen+1; + DIP("palignr $%d,%s,%s\n", (Int)d32, + dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + if (d32 == 0) { + assign( rHi, mkexpr(sHi) ); + assign( rLo, mkexpr(sLo) ); + } + else if (d32 >= 1 && d32 <= 7) { + assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) ); + assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) ); + } + else if (d32 == 8) { + assign( rHi, mkexpr(dLo) ); + assign( rLo, mkexpr(sHi) ); + } + else if (d32 >= 9 && d32 <= 15) { + assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) ); + assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) ); + } + else if (d32 == 16) { + assign( rHi, mkexpr(dHi) ); + assign( rLo, mkexpr(dLo) ); + } + else if (d32 >= 17 && d32 <= 23) { + assign( rHi, 
binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) ); + assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) ); + } + else if (d32 == 24) { + assign( rHi, mkU64(0) ); + assign( rLo, mkexpr(dHi) ); + } + else if (d32 >= 25 && d32 <= 31) { + assign( rHi, mkU64(0) ); + assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) ); + } + else if (d32 >= 32 && d32 <= 255) { + assign( rHi, mkU64(0) ); + assign( rLo, mkU64(0) ); + } + else + vassert(0); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ + if (sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)), + nameMMXReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameMMXReg(gregOfRM(modrm))); + } + + putMMXReg( + gregOfRM(modrm), + binop( + Iop_And64, + /* permute the lanes */ + binop( + Iop_Perm8x8, + mkexpr(dV), + binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL)) + ), + /* mask off lanes which have (index & 0x80) == 0x80 */ + unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7))) + ) + ); + goto decode_success; + } + + /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */ + if (sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp sevens = newTemp(Ity_I64); + IRTemp mask0x80hi = newTemp(Ity_I64); + IRTemp mask0x80lo = newTemp(Ity_I64); + IRTemp maskBit3hi = newTemp(Ity_I64); + IRTemp maskBit3lo = newTemp(Ity_I64); + IRTemp sAnd7hi = newTemp(Ity_I64); + IRTemp sAnd7lo = newTemp(Ity_I64); + IRTemp permdHi = newTemp(Ity_I64); + IRTemp permdLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRM(modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm))); + } else { + addr = disAMode ( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameXMMReg(gregOfRM(modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + assign( sevens, mkU64(0x0707070707070707ULL) ); + + /* + mask0x80hi = Not(SarN8x8(sHi,7)) + maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) + sAnd7hi = And(sHi,sevens) + permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), + And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) + rHi = And(permdHi,mask0x80hi) + */ + assign( + mask0x80hi, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); + + assign( + maskBit3hi, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), + mkU8(7))); + + assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); + + 
assign( + permdHi, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), + mkexpr(maskBit3hi)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), + unop(Iop_Not64,mkexpr(maskBit3hi))) )); + + assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); + + /* And the same for the lower half of the result. What fun. */ + + assign( + mask0x80lo, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); + + assign( + maskBit3lo, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), + mkU8(7))); + + assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); + + assign( + permdLo, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), + mkexpr(maskBit3lo)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), + unop(Iop_Not64,mkexpr(maskBit3lo))) )); + + assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); + + putXMMReg( + gregOfRM(modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* ---------------------------------------------------- */ + /* --- end of the SSSE3 decoder. --- */ + /* ---------------------------------------------------- */ + after_sse_decoders: /* ---------------------------------------------------- */ @@ -12920,7 +13907,7 @@ DisResult disInstr_X86_WRK ( d->fxState[1].fx = Ifx_Write; d->fxState[1].offset = OFFB_EBX; d->fxState[1].size = 4; - d->fxState[2].fx = Ifx_Write; + d->fxState[2].fx = Ifx_Modify; d->fxState[2].offset = OFFB_ECX; d->fxState[2].size = 4; d->fxState[3].fx = Ifx_Write; diff --git a/VEX/priv/host-amd64/hdefs.c b/VEX/priv/host-amd64/hdefs.c index 2ff737f036..9caf8c2f2c 100644 --- a/VEX/priv/host-amd64/hdefs.c +++ b/VEX/priv/host-amd64/hdefs.c @@ -2692,6 +2692,9 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, case Ijk_SigTRAP: *p++ = 0xBD; p = emit32(p, VEX_TRC_JMP_SIGTRAP); break; + case Ijk_SigSEGV: + *p++ = 0xBD; + p = emit32(p, VEX_TRC_JMP_SIGSEGV); break; case Ijk_Ret: case Ijk_Call: case Ijk_Boring: diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index c00dbdab91..d49031e7c7 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -1038,6 +1038,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_InterleaveHI32x2; break; case Iop_InterleaveLO32x2: fn = (HWord)h_generic_calc_InterleaveLO32x2; break; + case Iop_CatOddLanes16x4: + fn = (HWord)h_generic_calc_CatOddLanes16x4; break; + case Iop_CatEvenLanes16x4: + fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; + case Iop_Perm8x8: + fn = (HWord)h_generic_calc_Perm8x8; break; case Iop_Max8Ux8: fn = (HWord)h_generic_calc_Max8Ux8; break; @@ -1050,6 +1056,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Mul16x4: fn = (HWord)h_generic_calc_Mul16x4; break; + case Iop_Mul32x2: + fn = (HWord)h_generic_calc_Mul32x2; break; case Iop_MulHi16Sx4: fn = (HWord)h_generic_calc_MulHi16Sx4; break; case Iop_MulHi16Ux4: @@ -1095,6 +1103,10 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_ShlN16x4; second_is_UInt = True; break; + case Iop_ShlN8x8: + fn = (HWord)h_generic_calc_ShlN8x8; + second_is_UInt = True; + break; case Iop_ShrN32x2: fn = (HWord)h_generic_calc_ShrN32x2; second_is_UInt = True; diff --git a/VEX/priv/host-generic/h_generic_simd64.c b/VEX/priv/host-generic/h_generic_simd64.c index b271c0fea1..252fd4814d 100644 --- a/VEX/priv/host-generic/h_generic_simd64.c +++ 
b/VEX/priv/host-generic/h_generic_simd64.c @@ -142,6 +142,11 @@ static inline UChar sel8x8_0 ( ULong w64 ) { return toUChar(0xFF & (lo32 >> 0)); } +static inline UChar index8x8 ( ULong w64, UChar ix ) { + ix &= 7; + return toUChar((w64 >> (8*ix)) & 0xFF); +} + /* Scalar helpers. */ @@ -213,6 +218,12 @@ static inline Short mul16 ( Short xx, Short yy ) return (Short)t; } +static inline Int mul32 ( Int xx, Int yy ) +{ + Int t = ((Int)xx) * ((Int)yy); + return (Int)t; +} + static inline Short mulhi16S ( Short xx, Short yy ) { Int t = ((Int)xx) * ((Int)yy); @@ -299,6 +310,11 @@ static inline UChar qnarrow16Uto8 ( UShort xx0 ) /* shifts: we don't care about out-of-range ones, since that is dealt with at a higher level. */ +static inline UChar shl8 ( UChar v, UInt n ) +{ + return toUChar(v << n); +} + static inline UChar sar8 ( UChar v, UInt n ) { return toUChar(((Char)v) >> n); @@ -555,6 +571,14 @@ ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy ) ); } +ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy ) +{ + return mk32x2( + mul32( sel32x2_1(xx), sel32x2_1(yy) ), + mul32( sel32x2_0(xx), sel32x2_0(yy) ) + ); +} + ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy ) { return mk16x4( @@ -799,6 +823,42 @@ ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb ) ); } +/* ------------ Concatenation ------------ */ + +ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_3(aa), + sel16x4_1(aa), + sel16x4_3(bb), + sel16x4_1(bb) + ); +} + +ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_2(aa), + sel16x4_0(aa), + sel16x4_2(bb), + sel16x4_0(bb) + ); +} + +/* misc hack looking for a proper home */ +ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb ) +{ + return mk8x8( + index8x8(aa, sel8x8_7(bb)), + index8x8(aa, sel8x8_6(bb)), + index8x8(aa, sel8x8_5(bb)), + index8x8(aa, sel8x8_4(bb)), + index8x8(aa, sel8x8_3(bb)), + index8x8(aa, sel8x8_2(bb)), + index8x8(aa, sel8x8_1(bb)), + index8x8(aa, sel8x8_0(bb)) + ); +} /* ------------ Shifting ------------ */ /* Note that because these primops are undefined if the shift amount @@ -829,6 +889,22 @@ ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn ) ); } +ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn ) +{ + /* vassert(nn < 8); */ + nn &= 7; + return mk8x8( + shl8( sel8x8_7(xx), nn ), + shl8( sel8x8_6(xx), nn ), + shl8( sel8x8_5(xx), nn ), + shl8( sel8x8_4(xx), nn ), + shl8( sel8x8_3(xx), nn ), + shl8( sel8x8_2(xx), nn ), + shl8( sel8x8_1(xx), nn ), + shl8( sel8x8_0(xx), nn ) + ); +} + ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn ) { /* vassert(nn < 32); */ diff --git a/VEX/priv/host-generic/h_generic_simd64.h b/VEX/priv/host-generic/h_generic_simd64.h index eb00e690cf..e7d4c6927e 100644 --- a/VEX/priv/host-generic/h_generic_simd64.h +++ b/VEX/priv/host-generic/h_generic_simd64.h @@ -83,6 +83,7 @@ extern ULong h_generic_calc_QSub16Ux4 ( ULong, ULong ); extern ULong h_generic_calc_QSub8Ux8 ( ULong, ULong ); extern ULong h_generic_calc_Mul16x4 ( ULong, ULong ); +extern ULong h_generic_calc_Mul32x2 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Sx4 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Ux4 ( ULong, ULong ); @@ -108,6 +109,11 @@ extern ULong h_generic_calc_InterleaveLO16x4 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveHI32x2 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong ); +extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong ); +extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong ); +extern ULong 
h_generic_calc_Perm8x8 ( ULong, ULong ); + +extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt ); extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt ); extern ULong h_generic_calc_ShlN32x2 ( ULong, UInt ); diff --git a/VEX/priv/host-x86/isel.c b/VEX/priv/host-x86/isel.c index c00ec049b0..af50b5394e 100644 --- a/VEX/priv/host-x86/isel.c +++ b/VEX/priv/host-x86/isel.c @@ -2324,6 +2324,12 @@ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish; case Iop_InterleaveLO32x2: fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish; + case Iop_CatOddLanes16x4: + fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish; + case Iop_CatEvenLanes16x4: + fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish; + case Iop_Perm8x8: + fn = (HWord)h_generic_calc_Perm8x8; goto binnish; case Iop_Max8Ux8: fn = (HWord)h_generic_calc_Max8Ux8; goto binnish; @@ -2336,6 +2342,8 @@ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) case Iop_Mul16x4: fn = (HWord)h_generic_calc_Mul16x4; goto binnish; + case Iop_Mul32x2: + fn = (HWord)h_generic_calc_Mul32x2; goto binnish; case Iop_MulHi16Sx4: fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish; case Iop_MulHi16Ux4: @@ -2401,6 +2409,8 @@ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_ShlN32x2; goto shifty; case Iop_ShlN16x4: fn = (HWord)h_generic_calc_ShlN16x4; goto shifty; + case Iop_ShlN8x8: + fn = (HWord)h_generic_calc_ShlN8x8; goto shifty; case Iop_ShrN32x2: fn = (HWord)h_generic_calc_ShrN32x2; goto shifty; case Iop_ShrN16x4: diff --git a/VEX/priv/ir/irdefs.c b/VEX/priv/ir/irdefs.c index f4d80b88ed..af949c560e 100644 --- a/VEX/priv/ir/irdefs.c +++ b/VEX/priv/ir/irdefs.c @@ -326,6 +326,7 @@ void ppIROp ( IROp op ) case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return; case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return; case Iop_Mul16x4: vex_printf("Mul16x4"); return; + case Iop_Mul32x2: vex_printf("Mul32x2"); return; case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return; case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return; case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return; @@ -340,6 +341,7 @@ void ppIROp ( IROp op ) case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return; case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return; case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return; + case Iop_ShlN8x8: vex_printf("ShlN8x8"); return; case Iop_ShlN16x4: vex_printf("ShlN16x4"); return; case Iop_ShlN32x2: vex_printf("ShlN32x2"); return; case Iop_ShrN16x4: vex_printf("ShrN16x4"); return; @@ -356,6 +358,9 @@ void ppIROp ( IROp op ) case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return; case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return; case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return; + case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return; + case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return; + case Iop_Perm8x8: vex_printf("Iop_Perm8x8"); return; case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return; case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return; @@ -1506,9 +1511,12 @@ void typeOfPrimop ( IROp op, case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8: case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4: case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2: + case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4: + case Iop_Perm8x8: case Iop_Max8Ux8: case Iop_Max16Sx4: case Iop_Min8Ux8: case Iop_Min16Sx4: - case Iop_Mul16x4: case 
Iop_MulHi16Sx4: case Iop_MulHi16Ux4: + case Iop_Mul16x4: case Iop_Mul32x2: + case Iop_MulHi16Sx4: case Iop_MulHi16Ux4: case Iop_QAdd8Sx8: case Iop_QAdd16Sx4: case Iop_QAdd8Ux8: case Iop_QAdd16Ux4: case Iop_QNarrow32Sx2: @@ -1518,7 +1526,7 @@ void typeOfPrimop ( IROp op, case Iop_QSub8Ux8: case Iop_QSub16Ux4: BINARY(Ity_I64,Ity_I64, Ity_I64); - case Iop_ShlN32x2: case Iop_ShlN16x4: + case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8: case Iop_ShrN32x2: case Iop_ShrN16x4: case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8: BINARY(Ity_I64,Ity_I8, Ity_I64); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 056dd23dbb..4f8ef30771 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -658,7 +658,7 @@ typedef Iop_QSub8Sx8, Iop_QSub16Sx4, /* MULTIPLICATION (normal / high half of signed/unsigned) */ - Iop_Mul16x4, + Iop_Mul16x4, Iop_Mul32x2, Iop_MulHi16Ux4, Iop_MulHi16Sx4, @@ -677,7 +677,7 @@ typedef Iop_CmpGT8Sx8, Iop_CmpGT16Sx4, Iop_CmpGT32Sx2, /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */ - Iop_ShlN16x4, Iop_ShlN32x2, + Iop_ShlN8x8, Iop_ShlN16x4, Iop_ShlN32x2, Iop_ShrN16x4, Iop_ShrN32x2, Iop_SarN8x8, Iop_SarN16x4, Iop_SarN32x2, @@ -692,6 +692,19 @@ typedef Iop_InterleaveHI8x8, Iop_InterleaveHI16x4, Iop_InterleaveHI32x2, Iop_InterleaveLO8x8, Iop_InterleaveLO16x4, Iop_InterleaveLO32x2, + /* CONCATENATION -- build a new value by concatenating either + the even or odd lanes of both operands. Note that + Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2 + and so are omitted. */ + Iop_CatOddLanes16x4, Iop_CatEvenLanes16x4, + + /* PERMUTING -- copy src bytes to dst, + as indexed by control vector bytes: + for i in 0 .. 7 . result[i] = argL[ argR[i] ] + argR[i] values may only be in the range 0 .. 7, else behaviour + is undefined. */ + Iop_Perm8x8, + /* ------------------ 128-bit SIMD FP. ------------------ */ /* --- 32x4 vector FP --- */
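The opCatE/opCatO pattern at the top of the toIR.c hunk feeds even- and odd-lane concatenations of the source halves into a 64-bit lanewise op (opV64); that is how the new Iop_CatEvenLanes16x4 / Iop_CatOddLanes16x4 primops turn into horizontal arithmetic. A standalone sketch of that identity, not part of the patch (lane16, catEven16x4, catOdd16x4 are illustrative names, with lane ordering chosen to mirror h_generic_calc_Cat{Even,Odd}Lanes16x4): when the lanewise op is a 16x4 add, the result is the pairwise sum of the eight 16-bit lanes of hi:lo.

#include <stdio.h>
#include <stdint.h>

/* lane 0 is taken to be the least significant 16 bits, matching the
   sel16x4_* / mk16x4 helpers in h_generic_simd64.c */
static uint16_t lane16 ( uint64_t w, int i ) { return (uint16_t)(w >> (16*i)); }

static uint64_t mk16x4 ( uint16_t w3, uint16_t w2, uint16_t w1, uint16_t w0 )
{
   return ((uint64_t)w3 << 48) | ((uint64_t)w2 << 32)
        | ((uint64_t)w1 << 16) | (uint64_t)w0;
}

static uint64_t catEven16x4 ( uint64_t hi, uint64_t lo )
{
   return mk16x4(lane16(hi,2), lane16(hi,0), lane16(lo,2), lane16(lo,0));
}

static uint64_t catOdd16x4 ( uint64_t hi, uint64_t lo )
{
   return mk16x4(lane16(hi,3), lane16(hi,1), lane16(lo,3), lane16(lo,1));
}

static uint64_t add16x4 ( uint64_t a, uint64_t b )
{
   return mk16x4(lane16(a,3)+lane16(b,3), lane16(a,2)+lane16(b,2),
                 lane16(a,1)+lane16(b,1), lane16(a,0)+lane16(b,0));
}

int main ( void )
{
   uint64_t lo = 0x0004000300020001ULL, hi = 0x0008000700060005ULL;
   /* lanewise add of the even and odd concatenations gives the
      pairwise (horizontal) sums 3,7,11,15 of the lanes 1..8 */
   uint64_t r = add16x4(catEven16x4(hi,lo), catOdd16x4(hi,lo));
   printf("%016llx\n", (unsigned long long)r);   /* 000f000b00070003 */
   return 0;
}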
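For the PMULHRSW decodes, the per-half IR is built by dis_PMULHRSW_helper. As a reading aid only, here is a scalar model of one 16-bit lane, assuming the rounding rule documented for the instruction, (((a*b) >> 14) + 1) >> 1; this is a sketch, not a transcription of the helper:

#include <stdio.h>
#include <stdint.h>

/* one 16-bit lane of PMULHRSW: take the full 32-bit signed product,
   drop 14 bits, add a rounding bit, drop one more bit */
static int16_t pmulhrsw_lane ( int16_t a, int16_t b )
{
   int32_t t = (int32_t)a * (int32_t)b;
   t = (t >> 14) + 1;
   return (int16_t)(t >> 1);
}

int main ( void )
{
   /* 0x4000 is +0.5 in Q15; 0.5 * 0.5 rounds to 0.25 = 0x2000 */
   printf("%04x\n", (unsigned)(uint16_t)pmulhrsw_lane(0x4000, 0x4000));
   return 0;
}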
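For PSIGNB/W/D, each lane of dis_PSIGN_helper is a three-way select controlled only by the sign or zeroness of the source lane. A scalar sketch of one 16-bit lane, assuming the behaviour documented for the instruction:

#include <stdio.h>
#include <stdint.h>

/* one 16-bit lane of PSIGNW: the source lane only chooses
   negate / zero / keep for the corresponding destination lane */
static int16_t psignw_lane ( int16_t d, int16_t s )
{
   if (s < 0)  return (int16_t)(-d);   /* note: -0x8000 wraps back to 0x8000 */
   if (s == 0) return 0;
   return d;
}

int main ( void )
{
   printf("%d %d %d\n", psignw_lane(7,-3), psignw_lane(7,0), psignw_lane(7,5));
   /* prints: -7 0 7 */
   return 0;
}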
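For PABSB/W/D, dis_PABS_helper is a plain lane-wise absolute value; the most negative lane value comes back unchanged (0x8000 for 16-bit lanes) rather than saturating. A scalar sketch under that assumption:

#include <stdio.h>
#include <stdint.h>

/* one 16-bit lane of PABSW, returned as the raw lane bit pattern;
   computing in 32 bits makes the -32768 -> 0x8000 wrap explicit */
static uint16_t pabsw_lane ( int16_t x )
{
   int32_t v = x;
   return (uint16_t)(v < 0 ? -v : v);
}

int main ( void )
{
   printf("%u %u %04x\n", (unsigned)pabsw_lane(-5), (unsigned)pabsw_lane(9),
                          (unsigned)pabsw_lane(INT16_MIN));   /* 5 9 8000 */
   return 0;
}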
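The MMX PALIGNR case split (d32 == 0, 1..7, 8, 9..15, 16..255) implements a sliding byte window over the 16-byte value dV:sV. A 64-bit scalar sketch of that window, not part of the patch, mirroring the same branch structure:

#include <stdio.h>
#include <stdint.h>

/* result = the low 8 bytes of (d:s) >> (8*imm), zero-filled once the
   window has moved past d; same case split as the decoder */
static uint64_t palignr64 ( uint64_t d, uint64_t s, unsigned imm )
{
   if (imm == 0)  return s;
   if (imm <= 7)  return (s >> (8*imm)) | (d << (8*(8-imm)));
   if (imm == 8)  return d;
   if (imm <= 15) return d >> (8*(imm-8));
   return 0;   /* imm 16..255 */
}

int main ( void )
{
   uint64_t d = 0x8877665544332211ULL, s = 0xf0e0d0c0b0a09080ULL;
   printf("%016llx\n", (unsigned long long)palignr64(d, s, 3));
   /* prints 332211f0e0d0c0b0: bytes 3..10 of the 16-byte value d:s */
   return 0;
}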
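The 64-bit PSHUFB expression, And64(Perm8x8(dV, And64(sV, 0x0707070707070707)), Not64(SarN8x8(sV, 7))), combines the Iop_Perm8x8 rule given in libvex_ir.h (result[i] = argL[argR[i]]) with zeroing of lanes whose index byte has bit 7 set. A scalar sketch of the combined effect (byte_of and pshufb64 are illustrative names):

#include <stdio.h>
#include <stdint.h>

static uint8_t byte_of ( uint64_t w, unsigned i ) { return (uint8_t)(w >> (8*i)); }

/* result byte i = 0 if bit 7 of the index byte is set, otherwise the
   d byte selected by the low 3 bits of the index byte */
static uint64_t pshufb64 ( uint64_t d, uint64_t s )
{
   uint64_t r = 0;
   unsigned i;
   for (i = 0; i < 8; i++) {
      uint8_t ix = byte_of(s, i);
      uint8_t b  = (ix & 0x80) ? 0 : byte_of(d, ix & 7);
      r |= (uint64_t)b << (8*i);
   }
   return r;
}

int main ( void )
{
   uint64_t d = 0x1122334455667788ULL;
   uint64_t s = 0x8001020304050607ULL;   /* reverse d, zero the top byte */
   printf("%016llx\n", (unsigned long long)pshufb64(d, s));
   /* prints 0077665544332211 */
   return 0;
}

The XMM variant applies the same rule per 64-bit half, with the extra bit-3 masks choosing whether each permuted byte is drawn from dHi or dLo.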