From: Julian Seward Date: Thu, 29 Jul 2010 11:34:38 +0000 (+0000) Subject: Support the amd SSE4.something LZCNT instruction. Fixes #212335 X-Git-Tag: svn/VALGRIND_3_6_1^2~78 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=75c8959a07f6c1b2cf20eb4eb7a5dec04efdfd21;p=thirdparty%2Fvalgrind.git Support the amd SSE4.something LZCNT instruction. Fixes #212335 and its various clones, at least #227551, #241290 and #240639. git-svn-id: svn://svn.valgrind.org/vex/trunk@1994 --- diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 4ce3b1e32a..2270e424df 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -4436,6 +4436,40 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src ) } +/* Generate an IR sequence to do a count-leading-zeroes operation on + the supplied IRTemp, and return a new IRTemp holding the result. + 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where + the argument is zero, return the number of bits in the word (the + natural semantics). The operand is widened to 64 bits and shifted left so that its top bit sits at bit 63 before Clz64 is applied; the 64-bit count is then narrowed back to 'ty'. */ +static IRTemp gen_LZCNT ( IRType ty, IRTemp src ) +{ + vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16); + + IRTemp src64 = newTemp(Ity_I64); + assign(src64, widenUto64( mkexpr(src) )); + + IRTemp src64x = newTemp(Ity_I64); + assign(src64x, + binop(Iop_Shl64, mkexpr(src64), + mkU8(64 - 8 * sizeofIRType(ty)))); + + // Clz64 has undefined semantics when its input is zero, so + // special-case around that: the Mux0X yields the bit width of 'ty' when the shifted operand is zero, and the Clz64 result otherwise. + IRTemp res64 = newTemp(Ity_I64); + assign(res64, + IRExpr_Mux0X( + unop(Iop_1Uto8, + binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))), + unop(Iop_Clz64, mkexpr(src64x)), + mkU64(8 * sizeofIRType(ty)) + )); + + IRTemp res = newTemp(ty); + assign(res, narrowTo(ty, mkexpr(res64))); + return res; +} + + /*------------------------------------------------------------*/ /*--- ---*/ /*--- x87 FLOATING POINT INSTRUCTIONS ---*/ @@ -15065,6 +15099,67 @@ DisResult disInstr_AMD64_WRK ( goto decode_success; } + /* F3 0F BD -- LZCNT (count leading zeroes). 
An AMD extension, but + fortunately occupying opcode space which AFAICS is not occupied + by anything else, even in Intel land. NB: 0F BD is BSR, but + that's decoded below here, and we reject it if there's an F3 + prefix. Hence there is no possibility of confusion with this + one. */ + if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */ + && insn[0] == 0x0F && insn[1] == 0xBD) { + vassert(sz == 2 || sz == 4 || sz == 8); + /*IRType*/ ty = szToITy(sz); + IRTemp src = newTemp(ty); + modrm = insn[2]; + if (epartIsReg(modrm)) { + assign(src, getIRegE(sz, pfx, modrm)); + delta += 2+1; + DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), + nameIRegG(sz, pfx, modrm)); + } else { + addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0); + assign(src, loadLE(ty, mkexpr(addr))); + delta += 2+alen; + DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + nameIRegG(sz, pfx, modrm)); + } + + IRTemp res = gen_LZCNT(ty, src); + putIRegG(sz, pfx, modrm, mkexpr(res)); + + // Update flags. This is pretty lame .. perhaps can do better + // if this turns out to be performance critical. + // O S A P are cleared. Z is set if RESULT == 0. + // C is set if SRC is zero. 
+ IRTemp src64 = newTemp(Ity_I64); + IRTemp res64 = newTemp(Ity_I64); + assign(src64, widenUto64(mkexpr(src))); + assign(res64, widenUto64(mkexpr(res))); + + IRTemp oszacp = newTemp(Ity_I64); + assign( + oszacp, + binop(Iop_Or64, + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_Z)), + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) )); + + goto decode_success; + } + + /* ---------------------------------------------------- */ /* --- end of the SSE4 decoder --- */ /* ---------------------------------------------------- */ @@ -17707,6 +17802,72 @@ DisResult disInstr_AMD64 ( IRSB* irsb_IN, } +/*------------------------------------------------------------*/ +/*--- Unused stuff ---*/ +/*------------------------------------------------------------*/ + +// A potentially more Memcheck-friendly version of gen_LZCNT, if +// this should ever be needed. +// +//static IRTemp gen_LZCNT ( IRType ty, IRTemp src ) +//{ +// /* Scheme is simple: propagate the most significant 1-bit into all +// lower positions in the word. This gives a word of the form +// 0---01---1. Now invert it, giving a word of the form +// 1---10---0, then do a population-count idiom (to count the 1s, +// which is the number of leading zeroes, or the word size if the +// original word was 0). 
+// */ +// Int i; +// IRTemp t[7]; +// for (i = 0; i < 7; i++) { +// t[i] = newTemp(ty); +// } +// if (ty == Ity_I64) { +// assign(t[0], binop(Iop_Or64, mkexpr(src), +// binop(Iop_Shr64, mkexpr(src), mkU8(1)))); +// assign(t[1], binop(Iop_Or64, mkexpr(t[0]), +// binop(Iop_Shr64, mkexpr(t[0]), mkU8(2)))); +// assign(t[2], binop(Iop_Or64, mkexpr(t[1]), +// binop(Iop_Shr64, mkexpr(t[1]), mkU8(4)))); +// assign(t[3], binop(Iop_Or64, mkexpr(t[2]), +// binop(Iop_Shr64, mkexpr(t[2]), mkU8(8)))); +// assign(t[4], binop(Iop_Or64, mkexpr(t[3]), +// binop(Iop_Shr64, mkexpr(t[3]), mkU8(16)))); +// assign(t[5], binop(Iop_Or64, mkexpr(t[4]), +// binop(Iop_Shr64, mkexpr(t[4]), mkU8(32)))); +// assign(t[6], unop(Iop_Not64, mkexpr(t[5]))); +// return gen_POPCOUNT(ty, t[6]); +// } +// if (ty == Ity_I32) { +// assign(t[0], binop(Iop_Or32, mkexpr(src), +// binop(Iop_Shr32, mkexpr(src), mkU8(1)))); +// assign(t[1], binop(Iop_Or32, mkexpr(t[0]), +// binop(Iop_Shr32, mkexpr(t[0]), mkU8(2)))); +// assign(t[2], binop(Iop_Or32, mkexpr(t[1]), +// binop(Iop_Shr32, mkexpr(t[1]), mkU8(4)))); +// assign(t[3], binop(Iop_Or32, mkexpr(t[2]), +// binop(Iop_Shr32, mkexpr(t[2]), mkU8(8)))); +// assign(t[4], binop(Iop_Or32, mkexpr(t[3]), +// binop(Iop_Shr32, mkexpr(t[3]), mkU8(16)))); +// assign(t[5], unop(Iop_Not32, mkexpr(t[4]))); +// return gen_POPCOUNT(ty, t[5]); +// } +// if (ty == Ity_I16) { +// assign(t[0], binop(Iop_Or16, mkexpr(src), +// binop(Iop_Shr16, mkexpr(src), mkU8(1)))); +// assign(t[1], binop(Iop_Or16, mkexpr(t[0]), +// binop(Iop_Shr16, mkexpr(t[0]), mkU8(2)))); +// assign(t[2], binop(Iop_Or16, mkexpr(t[1]), +// binop(Iop_Shr16, mkexpr(t[1]), mkU8(4)))); +// assign(t[3], binop(Iop_Or16, mkexpr(t[2]), +// binop(Iop_Shr16, mkexpr(t[2]), mkU8(8)))); +// assign(t[4], unop(Iop_Not16, mkexpr(t[3]))); +// return gen_POPCOUNT(ty, t[4]); +// } +// vassert(0); +//} + /*--------------------------------------------------------------------*/ /*--- end guest_amd64_toIR.c ---*/ diff --git 
a/VEX/priv/guest_x86_toIR.c b/VEX/priv/guest_x86_toIR.c index 2bf234a772..991d1a076e 100644 --- a/VEX/priv/guest_x86_toIR.c +++ b/VEX/priv/guest_x86_toIR.c @@ -3362,6 +3362,40 @@ UInt dis_imul_I_E_G ( UChar sorb, } +/* Generate an IR sequence to do a count-leading-zeroes operation on + the supplied IRTemp, and return a new IRTemp holding the result. + 'ty' may be Ity_I16 or Ity_I32 only. In the case where the + argument is zero, return the number of bits in the word (the + natural semantics). The operand is widened to 32 bits and shifted left so that its top bit sits at bit 31 before Clz32 is applied; the 32-bit count is then narrowed back to 'ty'. */ +static IRTemp gen_LZCNT ( IRType ty, IRTemp src ) +{ + vassert(ty == Ity_I32 || ty == Ity_I16); + + IRTemp src32 = newTemp(Ity_I32); + assign(src32, widenUto32( mkexpr(src) )); + + IRTemp src32x = newTemp(Ity_I32); + assign(src32x, + binop(Iop_Shl32, mkexpr(src32), + mkU8(32 - 8 * sizeofIRType(ty)))); + + // Clz32 has undefined semantics when its input is zero, so + // special-case around that: the Mux0X yields the bit width of 'ty' when the shifted operand is zero, and the Clz32 result otherwise. + IRTemp res32 = newTemp(Ity_I32); + assign(res32, + IRExpr_Mux0X( + unop(Iop_1Uto8, + binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))), + unop(Iop_Clz32, mkexpr(src32x)), + mkU32(8 * sizeofIRType(ty)) + )); + + IRTemp res = newTemp(ty); + assign(res, narrowTo(ty, mkexpr(res32))); + return res; +} + + /*------------------------------------------------------------*/ /*--- ---*/ /*--- x87 FLOATING POINT INSTRUCTIONS ---*/ @@ -12579,6 +12613,66 @@ DisResult disInstr_X86_WRK ( goto decode_success; } + /* F3 0F BD -- LZCNT (count leading zeroes). An AMD extension, but + fortunately occupying opcode space which AFAICS is not occupied + by anything else, even in Intel land. NB: 0F BD is BSR, but + that's decoded below here, and it won't match if there's an F3 + prefix. Hence there is no possibility of confusion with this + one. 
*/ + if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD) { + vassert(sz == 2 || sz == 4); + /*IRType*/ ty = szToITy(sz); + IRTemp src = newTemp(ty); + modrm = insn[3]; + if (epartIsReg(modrm)) { + assign(src, getIReg(sz, eregOfRM(modrm))); + delta += 3+1; + DIP("lzcnt%c %s, %s\n", nameISize(sz), + nameIReg(sz, eregOfRM(modrm)), + nameIReg(sz, gregOfRM(modrm))); + } else { + addr = disAMode( &alen, sorb, delta+3, dis_buf ); + assign(src, loadLE(ty, mkexpr(addr))); + delta += 3+alen; + DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + nameIReg(sz, gregOfRM(modrm))); + } + + IRTemp res = gen_LZCNT(ty, src); + putIReg(sz, gregOfRM(modrm), mkexpr(res)); + + // Update flags. This is pretty lame .. perhaps can do better + // if this turns out to be performance critical. + // O S A P are cleared. Z is set if RESULT == 0. + // C is set if SRC is zero. + IRTemp src32 = newTemp(Ity_I32); + IRTemp res32 = newTemp(Ity_I32); + assign(src32, widenUto32(mkexpr(src))); + assign(res32, widenUto32(mkexpr(res))); + + IRTemp oszacp = newTemp(Ity_I32); + assign( + oszacp, + binop(Iop_Or32, + binop(Iop_Shl32, + unop(Iop_1Uto32, + binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))), + mkU8(X86G_CC_SHIFT_Z)), + binop(Iop_Shl32, + unop(Iop_1Uto32, + binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))), + mkU8(X86G_CC_SHIFT_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) )); + + goto decode_success; + } + /* ---------------------------------------------------- */ /* --- end of the SSE4 decoder --- */ /* ---------------------------------------------------- */