From: Julian Seward Date: Wed, 6 Feb 2008 11:42:45 +0000 (+0000) Subject: Add SSSE3 support. Currently only for 64-bit. TODO: X-Git-Tag: svn/VALGRIND_3_4_1^2~43 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8ddb62873b02fce10061acf4f8540b1cf7c8c5eb;p=thirdparty%2Fvalgrind.git Add SSSE3 support. Currently only for 64-bit. TODO: * Check through IR generation * For 128-bit variants accessing memory, generate an exception if effective address is not 128-bit aligned * Change CPUID output to be Core-2 like * Enable for 32-bit code too. * Make Memcheck handle the new IROps * Commit test cases git-svn-id: svn://svn.valgrind.org/vex/trunk@1808 --- diff --git a/VEX/priv/guest-amd64/toIR.c b/VEX/priv/guest-amd64/toIR.c index 0003976228..5c8570d80b 100644 --- a/VEX/priv/guest-amd64/toIR.c +++ b/VEX/priv/guest-amd64/toIR.c @@ -8309,6 +8309,165 @@ static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2, } +/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit + values (aa,bb), computes, for each of the 4 16-bit lanes: + + (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1 +*/ +static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp aahi32s = newTemp(Ity_I64); + IRTemp aalo32s = newTemp(Ity_I64); + IRTemp bbhi32s = newTemp(Ity_I64); + IRTemp bblo32s = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp one32x2 = newTemp(Ity_I64); + assign(aa, aax); + assign(bb, bbx); + assign( aahi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( aalo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( bbhi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign( bblo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign(one32x2, mkU64( (1ULL << 32) + 1 )); + assign( + rHi, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + assign( + rLo, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + return + binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo)); +} + +/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. 
Given two 64-bit + values (aa,bb), computes, for each lane: + + if aa_lane < 0 then - bb_lane + else if aa_lane > 0 then bb_lane + else 0 +*/ +static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp bbNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opCmpGTS = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break; + case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break; + case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( bb, bbx ); + assign( zero, mkU64(0) ); + assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) ); + assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) ); + assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) ); + + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(bb), mkexpr(posMask)), + binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) ); + +} + +/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit + value aa, computes, for each lane + + if aa < 0 then -aa else aa + + Note that the result is interpreted as unsigned, so that the + absolute value of the most negative signed input can be + represented. +*/ +static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp aaNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opSarN = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break; + case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break; + case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) ); + assign( posMask, unop(Iop_Not64, mkexpr(negMask)) ); + assign( zero, mkU64(0) ); + assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) ); + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(aa), mkexpr(posMask)), + binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ); +} + +static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, + IRTemp lo64, Long byteShift ) +{ + vassert(byteShift >= 1 && byteShift <= 7); + return + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))), + binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift)) + ); +} + + /* Helper for deciding whether a given insn (starting at the opcode byte) may validly be used with a LOCK prefix. The following insns may be used with LOCK when their destination operand is in memory. @@ -12455,6 +12614,830 @@ DisResult disInstr_AMD64_WRK ( /* --- end of the SSE3 decoder. --- */ /* ---------------------------------------------------- */ + /* ---------------------------------------------------- */ + /* --- start of the SSSE3 decoder. 
--- */ + /* ---------------------------------------------------- */ + + /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp sVoddsSX = newTemp(Ity_I64); + IRTemp sVevensSX = newTemp(Ity_I64); + IRTemp dVoddsZX = newTemp(Ity_I64); + IRTemp dVevensZX = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x4, + binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x4, + binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putMMXReg( + gregLO3ofRM(modrm), + binop(Iop_QAdd16Sx4, + binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sVoddsSX = newTemp(Ity_V128); + IRTemp sVevensSX = newTemp(Ity_V128); + IRTemp dVoddsZX = newTemp(Ity_V128); + IRTemp dVevensZX = newTemp(Ity_V128); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + /* FIXME: generate trap if addr is not 16-aligned */ + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x8, + binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x8, + binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_QAdd16Sx8, + binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* ***--- these are MMX class insns introduced in SSSE3 ---*** */ + /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G + to G (mmx). 
*/ + /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G + to G (mmx). */ + + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("ph%s %s,%s\n", str, dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop(opV64, + binop(opCatE,mkexpr(sV),mkexpr(dV)), + binop(opCatO,mkexpr(sV),mkexpr(dV)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and + G to G (xmm). 
*/
+
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+      HChar* str = "???";
+      IROp opV64 = Iop_INVALID;
+      IROp opCatO = Iop_CatOddLanes16x4;
+      IROp opCatE = Iop_CatEvenLanes16x4;
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+
+      switch (insn[2]) {
+         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+         case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+         case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+         case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+         case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+         default: vassert(0);
+      }
+      if (insn[2] == 0x02 || insn[2] == 0x06) {
+         opCatO = Iop_InterleaveHI32x2;
+         opCatE = Iop_InterleaveLO32x2;
+      }
+
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
+         delta += 3+1;
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         DIP("ph%s %s,%s\n", str, dis_buf,
+                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
+         delta += 3+alen;
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      /* This isn't a particularly efficient way to compute the
+         result, but at least it avoids a proliferation of IROps,
+         hence avoids complicating all the backends.
*/
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               binop(opV64,
+                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
+                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
+               ),
+               binop(opV64,
+                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
+                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
+               )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
+      (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+      IRTemp sV = newTemp(Ity_I64);
+      IRTemp dV = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+                                 nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pmulhrsw %s,%s\n", dis_buf,
+                                 nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
+      Scale (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pmulhrsw %s,%s\n", dis_buf,
+                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
+               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
+   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
+   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+      IRTemp sV = newTemp(Ity_I64);
+      IRTemp dV = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x08: laneszB = 1; str = "b"; break;
+         case 0x09: laneszB = 2; str = "w"; break;
+         case 0x0A: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+                                     nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("psign%s %s,%s\n", str, dis_buf,
+
nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
+   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
+   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x08: laneszB = 1; str = "b"; break;
+         case 0x09: laneszB = 2; str = "w"; break;
+         case 0x0A: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("psign%s %s,%s\n", str, dis_buf,
+                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
+               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
+   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
+   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+      IRTemp sV = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x1C: laneszB = 1; str = "b"; break;
+         case 0x1D: laneszB = 2; str = "w"; break;
+         case 0x1E: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      do_MMX_preamble();
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+                                    nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pabs%s %s,%s\n", str, dis_buf,
+                                    nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PABS_helper( mkexpr(sV), laneszB )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
+   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
+   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+
case 0x1C: laneszB = 1; str = "b"; break;
+         case 0x1D: laneszB = 2; str = "w"; break;
+         case 0x1E: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pabs%s %s,%s\n", str, dis_buf,
+                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PABS_helper( mkexpr(sHi), laneszB ),
+               dis_PABS_helper( mkexpr(sLo), laneszB )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
+   if (haveNo66noF2noF3(pfx) && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+      IRTemp sV  = newTemp(Ity_I64);
+      IRTemp dV  = newTemp(Ity_I64);
+      IRTemp res = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         d64 = (Long)insn[3+1];
+         delta += 3+1+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             nameMMXReg(eregLO3ofRM(modrm)),
+             nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         d64 = (Long)insn[3+alen];
+         delta += 3+alen+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             dis_buf,
+             nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      if (d64 == 0) {
+         assign( res, mkexpr(sV) );
+      }
+      else if (d64 >= 1 && d64 <= 7) {
+         assign(res,
+                binop(Iop_Or64,
+                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
+                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
+                )));
+      }
+      else if (d64 == 8) {
+         assign( res, mkexpr(dV) );
+      }
+      else if (d64 >= 9 && d64 <= 15) {
+         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
+      }
+      else if (d64 >= 16 && d64 <= 255) {
+         assign( res, mkU64(0) );
+      }
+      else
+         vassert(0);
+
+      putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
+      goto decode_success;
+   }
+
+   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+      IRTemp sV  = newTemp(Ity_V128);
+      IRTemp dV  = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+      IRTemp rHi = newTemp(Ity_I64);
+      IRTemp rLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         d64 = (Long)insn[3+1];
+         delta += 3+1+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             nameXMMReg(eregOfRexRM(pfx,modrm)),
+             nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         d64 = (Long)insn[3+alen];
+         delta += 3+alen+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             dis_buf,
+             nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo,
unop(Iop_V128to64, mkexpr(sV)) ); + + if (d64 == 0) { + assign( rHi, mkexpr(sHi) ); + assign( rLo, mkexpr(sLo) ); + } + else if (d64 >= 1 && d64 <= 7) { + assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) ); + assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) ); + } + else if (d64 == 8) { + assign( rHi, mkexpr(dLo) ); + assign( rLo, mkexpr(sHi) ); + } + else if (d64 >= 9 && d64 <= 15) { + assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) ); + assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) ); + } + else if (d64 == 16) { + assign( rHi, mkexpr(dHi) ); + assign( rLo, mkexpr(dLo) ); + } + else if (d64 >= 17 && d64 <= 23) { + assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) ); + assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) ); + } + else if (d64 == 24) { + assign( rHi, mkU64(0) ); + assign( rLo, mkexpr(dHi) ); + } + else if (d64 >= 25 && d64 <= 31) { + assign( rHi, mkU64(0) ); + assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) ); + } + else if (d64 >= 32 && d64 <= 255) { + assign( rHi, mkU64(0) ); + assign( rLo, mkU64(0) ); + } + else + vassert(0); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop( + Iop_And64, + /* permute the lanes */ + binop( + Iop_Perm8x8, + mkexpr(dV), + binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL)) + ), + /* mask off lanes which have (index & 0x80) == 0x80 */ + unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7))) + ) + ); + goto decode_success; + } + + /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp sevens = newTemp(Ity_I64); + IRTemp mask0x80hi = newTemp(Ity_I64); + IRTemp mask0x80lo = newTemp(Ity_I64); + IRTemp maskBit3hi = newTemp(Ity_I64); + IRTemp maskBit3lo = newTemp(Ity_I64); + IRTemp sAnd7hi = newTemp(Ity_I64); + IRTemp sAnd7lo = newTemp(Ity_I64); + IRTemp permdHi = newTemp(Ity_I64); + IRTemp permdLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + /* FIXME: generate trap if addr is not 16-aligned */ + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb 
%s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + assign( sevens, mkU64(0x0707070707070707ULL) ); + + /* + mask0x80hi = Not(SarN8x8(sHi,7)) + maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) + sAnd7hi = And(sHi,sevens) + permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), + And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) + rHi = And(permdHi,mask0x80hi) + */ + assign( + mask0x80hi, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); + + assign( + maskBit3hi, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), + mkU8(7))); + + assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); + + assign( + permdHi, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), + mkexpr(maskBit3hi)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), + unop(Iop_Not64,mkexpr(maskBit3hi))) )); + + assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); + + /* And the same for the lower half of the result. What fun. */ + + assign( + mask0x80lo, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); + + assign( + maskBit3lo, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), + mkU8(7))); + + assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); + + assign( + permdLo, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), + mkexpr(maskBit3lo)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), + unop(Iop_Not64,mkexpr(maskBit3lo))) )); + + assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* ---------------------------------------------------- */ + /* --- end of the SSSE3 decoder. --- */ + /* ---------------------------------------------------- */ + /*after_sse_decoders:*/ /* Get the primary opcode. */ @@ -14699,11 +15682,13 @@ DisResult disInstr_AMD64_WRK ( decode_failure: /* All decode failures end up here. */ vex_printf("vex amd64->IR: unhandled instruction bytes: " - "0x%x 0x%x 0x%x 0x%x\n", + "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", (Int)getUChar(delta_start+0), (Int)getUChar(delta_start+1), (Int)getUChar(delta_start+2), - (Int)getUChar(delta_start+3) ); + (Int)getUChar(delta_start+3), + (Int)getUChar(delta_start+4), + (Int)getUChar(delta_start+5) ); /* Tell the dispatcher that this insn cannot be decoded, and so has not been executed, and (is currently) the next to be executed. 
diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index c00dbdab91..d49031e7c7 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -1038,6 +1038,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_InterleaveHI32x2; break; case Iop_InterleaveLO32x2: fn = (HWord)h_generic_calc_InterleaveLO32x2; break; + case Iop_CatOddLanes16x4: + fn = (HWord)h_generic_calc_CatOddLanes16x4; break; + case Iop_CatEvenLanes16x4: + fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; + case Iop_Perm8x8: + fn = (HWord)h_generic_calc_Perm8x8; break; case Iop_Max8Ux8: fn = (HWord)h_generic_calc_Max8Ux8; break; @@ -1050,6 +1056,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Mul16x4: fn = (HWord)h_generic_calc_Mul16x4; break; + case Iop_Mul32x2: + fn = (HWord)h_generic_calc_Mul32x2; break; case Iop_MulHi16Sx4: fn = (HWord)h_generic_calc_MulHi16Sx4; break; case Iop_MulHi16Ux4: @@ -1095,6 +1103,10 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_ShlN16x4; second_is_UInt = True; break; + case Iop_ShlN8x8: + fn = (HWord)h_generic_calc_ShlN8x8; + second_is_UInt = True; + break; case Iop_ShrN32x2: fn = (HWord)h_generic_calc_ShrN32x2; second_is_UInt = True; diff --git a/VEX/priv/host-generic/h_generic_simd64.c b/VEX/priv/host-generic/h_generic_simd64.c index b271c0fea1..252fd4814d 100644 --- a/VEX/priv/host-generic/h_generic_simd64.c +++ b/VEX/priv/host-generic/h_generic_simd64.c @@ -142,6 +142,11 @@ static inline UChar sel8x8_0 ( ULong w64 ) { return toUChar(0xFF & (lo32 >> 0)); } +static inline UChar index8x8 ( ULong w64, UChar ix ) { + ix &= 7; + return toUChar((w64 >> (8*ix)) & 0xFF); +} + /* Scalar helpers. */ @@ -213,6 +218,12 @@ static inline Short mul16 ( Short xx, Short yy ) return (Short)t; } +static inline Int mul32 ( Int xx, Int yy ) +{ + Int t = ((Int)xx) * ((Int)yy); + return (Int)t; +} + static inline Short mulhi16S ( Short xx, Short yy ) { Int t = ((Int)xx) * ((Int)yy); @@ -299,6 +310,11 @@ static inline UChar qnarrow16Uto8 ( UShort xx0 ) /* shifts: we don't care about out-of-range ones, since that is dealt with at a higher level. 
*/ +static inline UChar shl8 ( UChar v, UInt n ) +{ + return toUChar(v << n); +} + static inline UChar sar8 ( UChar v, UInt n ) { return toUChar(((Char)v) >> n); @@ -555,6 +571,14 @@ ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy ) ); } +ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy ) +{ + return mk32x2( + mul32( sel32x2_1(xx), sel32x2_1(yy) ), + mul32( sel32x2_0(xx), sel32x2_0(yy) ) + ); +} + ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy ) { return mk16x4( @@ -799,6 +823,42 @@ ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb ) ); } +/* ------------ Concatenation ------------ */ + +ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_3(aa), + sel16x4_1(aa), + sel16x4_3(bb), + sel16x4_1(bb) + ); +} + +ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_2(aa), + sel16x4_0(aa), + sel16x4_2(bb), + sel16x4_0(bb) + ); +} + +/* misc hack looking for a proper home */ +ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb ) +{ + return mk8x8( + index8x8(aa, sel8x8_7(bb)), + index8x8(aa, sel8x8_6(bb)), + index8x8(aa, sel8x8_5(bb)), + index8x8(aa, sel8x8_4(bb)), + index8x8(aa, sel8x8_3(bb)), + index8x8(aa, sel8x8_2(bb)), + index8x8(aa, sel8x8_1(bb)), + index8x8(aa, sel8x8_0(bb)) + ); +} /* ------------ Shifting ------------ */ /* Note that because these primops are undefined if the shift amount @@ -829,6 +889,22 @@ ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn ) ); } +ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn ) +{ + /* vassert(nn < 8); */ + nn &= 7; + return mk8x8( + shl8( sel8x8_7(xx), nn ), + shl8( sel8x8_6(xx), nn ), + shl8( sel8x8_5(xx), nn ), + shl8( sel8x8_4(xx), nn ), + shl8( sel8x8_3(xx), nn ), + shl8( sel8x8_2(xx), nn ), + shl8( sel8x8_1(xx), nn ), + shl8( sel8x8_0(xx), nn ) + ); +} + ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn ) { /* vassert(nn < 32); */ diff --git a/VEX/priv/host-generic/h_generic_simd64.h b/VEX/priv/host-generic/h_generic_simd64.h index eb00e690cf..e7d4c6927e 100644 --- a/VEX/priv/host-generic/h_generic_simd64.h +++ b/VEX/priv/host-generic/h_generic_simd64.h @@ -83,6 +83,7 @@ extern ULong h_generic_calc_QSub16Ux4 ( ULong, ULong ); extern ULong h_generic_calc_QSub8Ux8 ( ULong, ULong ); extern ULong h_generic_calc_Mul16x4 ( ULong, ULong ); +extern ULong h_generic_calc_Mul32x2 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Sx4 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Ux4 ( ULong, ULong ); @@ -108,6 +109,11 @@ extern ULong h_generic_calc_InterleaveLO16x4 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveHI32x2 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong ); +extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong ); +extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong ); +extern ULong h_generic_calc_Perm8x8 ( ULong, ULong ); + +extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt ); extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt ); extern ULong h_generic_calc_ShlN32x2 ( ULong, UInt ); diff --git a/VEX/priv/ir/irdefs.c b/VEX/priv/ir/irdefs.c index f4d80b88ed..af949c560e 100644 --- a/VEX/priv/ir/irdefs.c +++ b/VEX/priv/ir/irdefs.c @@ -326,6 +326,7 @@ void ppIROp ( IROp op ) case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return; case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return; case Iop_Mul16x4: vex_printf("Mul16x4"); return; + case Iop_Mul32x2: vex_printf("Mul32x2"); return; case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return; case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return; case Iop_Avg8Ux8: 
vex_printf("Avg8Ux8"); return; @@ -340,6 +341,7 @@ void ppIROp ( IROp op ) case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return; case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return; case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return; + case Iop_ShlN8x8: vex_printf("ShlN8x8"); return; case Iop_ShlN16x4: vex_printf("ShlN16x4"); return; case Iop_ShlN32x2: vex_printf("ShlN32x2"); return; case Iop_ShrN16x4: vex_printf("ShrN16x4"); return; @@ -356,6 +358,9 @@ void ppIROp ( IROp op ) case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return; case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return; case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return; + case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return; + case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return; + case Iop_Perm8x8: vex_printf("Iop_Perm8x8"); return; case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return; case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return; @@ -1506,9 +1511,12 @@ void typeOfPrimop ( IROp op, case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8: case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4: case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2: + case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4: + case Iop_Perm8x8: case Iop_Max8Ux8: case Iop_Max16Sx4: case Iop_Min8Ux8: case Iop_Min16Sx4: - case Iop_Mul16x4: case Iop_MulHi16Sx4: case Iop_MulHi16Ux4: + case Iop_Mul16x4: case Iop_Mul32x2: + case Iop_MulHi16Sx4: case Iop_MulHi16Ux4: case Iop_QAdd8Sx8: case Iop_QAdd16Sx4: case Iop_QAdd8Ux8: case Iop_QAdd16Ux4: case Iop_QNarrow32Sx2: @@ -1518,7 +1526,7 @@ void typeOfPrimop ( IROp op, case Iop_QSub8Ux8: case Iop_QSub16Ux4: BINARY(Ity_I64,Ity_I64, Ity_I64); - case Iop_ShlN32x2: case Iop_ShlN16x4: + case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8: case Iop_ShrN32x2: case Iop_ShrN16x4: case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8: BINARY(Ity_I64,Ity_I8, Ity_I64); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 056dd23dbb..4f8ef30771 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -658,7 +658,7 @@ typedef Iop_QSub8Sx8, Iop_QSub16Sx4, /* MULTIPLICATION (normal / high half of signed/unsigned) */ - Iop_Mul16x4, + Iop_Mul16x4, Iop_Mul32x2, Iop_MulHi16Ux4, Iop_MulHi16Sx4, @@ -677,7 +677,7 @@ typedef Iop_CmpGT8Sx8, Iop_CmpGT16Sx4, Iop_CmpGT32Sx2, /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */ - Iop_ShlN16x4, Iop_ShlN32x2, + Iop_ShlN8x8, Iop_ShlN16x4, Iop_ShlN32x2, Iop_ShrN16x4, Iop_ShrN32x2, Iop_SarN8x8, Iop_SarN16x4, Iop_SarN32x2, @@ -692,6 +692,19 @@ typedef Iop_InterleaveHI8x8, Iop_InterleaveHI16x4, Iop_InterleaveHI32x2, Iop_InterleaveLO8x8, Iop_InterleaveLO16x4, Iop_InterleaveLO32x2, + /* CONCATENATION -- build a new value by concatenating either + the even or odd lanes of both operands. Note that + Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2 + and so are omitted. */ + Iop_CatOddLanes16x4, Iop_CatEvenLanes16x4, + + /* PERMUTING -- copy src bytes to dst, + as indexed by control vector bytes: + for i in 0 .. 7 . result[i] = argL[ argR[i] ] + argR[i] values may only be in the range 0 .. 7, else behaviour + is undefined. */ + Iop_Perm8x8, + /* ------------------ 128-bit SIMD FP. ------------------ */ /* --- 32x4 vector FP --- */