From b145cd9c3c0a2e27af4aaba00778b12e733b8e34 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Tue, 17 Aug 2010 22:52:08 +0000 Subject: [PATCH] Add a moderately comprehensive implementation of the SSE4.2 string instructions PCMP{I,E}STR{I,M}. They are an absolute nightmare of complexity. Most of the 8-bit data processing variants are supported, but none of the 16-bit variants. Also add support for PINSRB and PTEST. With these changes, I believe Valgrind supports all the SSE4.2 instructions used in glibc-2.11 on x86_64-linux, as well as anything that gcc can emit. So that gives fairly good coverage. Currently these instructions are handled, but CPUID still claims to be an older, non-SSE4 capable Core 2, so that software that correctly checks CPU features should not use them. Following further testing I will enable the relevant SSE4.2 bits in CPUID. git-svn-id: svn://svn.valgrind.org/vex/trunk@2010 --- VEX/priv/guest_amd64_defs.h | 53 +++- VEX/priv/guest_amd64_helpers.c | 144 ++++++++--- VEX/priv/guest_amd64_toIR.c | 263 +++++++++++++++++--- VEX/priv/guest_generic_x87.c | 427 +++++++++++++++++++------------- VEX/priv/guest_generic_x87.h | 17 +- VEX/priv/host_amd64_isel.c | 102 +++----- VEX/priv/host_generic_simd128.h | 12 - VEX/pub/libvex_basictypes.h | 10 + 8 files changed, 692 insertions(+), 336 deletions(-) diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 7a40ba6e01..3d6e128bd0 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -157,14 +157,51 @@ extern void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, extern void amd64g_dirtyhelper_SxDT ( void* address, ULong op /* 0 or 1 */ ); -extern ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State*, - HWord, HWord ); +/* Helps with PCMP{I,E}STR{I,M}. + + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 2 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state for the xSTRM cases, no + accesses of memory, is a pure function. + + opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so + the callee knows which I/E and I/M variant it is dealing with and + what the specific operation is. 4th byte of opcode is in the range + 0x60 to 0x63: + istri 66 0F 3A 63 + istrm 66 0F 3A 62 + estri 66 0F 3A 61 + estrm 66 0F 3A 60 + + gstOffL and gstOffR are the guest state offsets for the two XMM + register inputs. We never have to deal with the memory case since + that is handled by pre-loading the relevant value into the fake + XMM16 register. + + For ESTRx variants, edxIN and eaxIN hold the values of those two + registers. + + In all cases, the bottom 16 bits of the result contain the new + OSZACP %rflags values. For xSTRI variants, bits[31:16] of the + result hold the new %ecx value. For xSTRM variants, the helper + writes the result directly to the guest XMM0. + + Declarable side effects: in all cases, reads guest state at + [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes + guest_XMM0. + + Is expected to be called with opc_and_imm combinations which have + actually been validated, and will assert if otherwise. The front + end should ensure we're only called with verified values. 
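The opc4_and_imm packing and the result layout described here can be made concrete with a small standalone sketch. Illustrative only: it uses plain C types in place of VEX's, and the helper result value shown is made up, not from a real run.

   /* Sketch of the calling convention described above. */
   #include <stdio.h>

   int main(void)
   {
      unsigned opc4 = 0x63;                     /* 66 0F 3A 63 = pcmpistri */
      unsigned imm8 = 0x0C;                     /* one byte-size variant */
      unsigned long opc4_and_imm = (opc4 << 8) | imm8;

      /* Recover the fields as the helper does. */
      unsigned opc    = (opc4_and_imm >> 8) & 0xFF;
      int     isISTRx = (opc & 2) != 0;         /* 0x62/0x63: implicit length */
      int     isxSTRM = ((opc & 1) ^ 1) != 0;   /* 0x60/0x62: M-format output */

      /* For an xSTRI variant the result packs ECX above OSZACP. */
      unsigned long res       = 0x000A0841;     /* made-up helper result */
      unsigned      newECX    = (unsigned)((res >> 16) & 0xFFFF);
      unsigned      newOSZACP = (unsigned)(res & 0xFFFF);
      printf("isISTRx=%d isxSTRM=%d ecx=%u flags=%#x\n",
             isISTRx, isxSTRM, newECX, newOSZACP);
      return 0;
   }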
+*/ +extern ULong amd64g_dirtyhelper_PCMPxSTRx ( + VexGuestAMD64State*, + HWord opc4_and_imm, + HWord gstOffL, HWord gstOffR, + HWord edxIN, HWord eaxIN + ); + //extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* ); //extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* ); diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index 48f915cdc7..e3f9d510f7 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -2514,45 +2514,123 @@ ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ) /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ /*---------------------------------------------------------------*/ -/* CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, - actually it could be a clean helper, but for the fact that we can't - pass by value 2 x V128 to a clean helper.) Reads guest state, no - writes to guest state, no accesses of memory, is a pure function. - This relies on the property that the XMM regs are laid out - consecutively in the guest state, so we can index into them here. - Returned value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. -*/ -ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) +static UInt zmask_from_V128 ( V128* arg ) { - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_08( argL, argR ); + UInt i, res = 0; + for (i = 0; i < 16; i++) { + res |= ((arg->w8[i] == 0) ? 1 : 0) << i; + } + return res; } -ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) -{ - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_0C( argL, argR ); -} +/* Helps with PCMP{I,E}STR{I,M}. -ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 2 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state for the xSTRM cases, no + accesses of memory, is a pure function. + + opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so + the callee knows which I/E and I/M variant it is dealing with and + what the specific operation is. 4th byte of opcode is in the range + 0x60 to 0x63: + istri 66 0F 3A 63 + istrm 66 0F 3A 62 + estri 66 0F 3A 61 + estrm 66 0F 3A 60 + + gstOffL and gstOffR are the guest state offsets for the two XMM + register inputs. We never have to deal with the memory case since + that is handled by pre-loading the relevant value into the fake + XMM16 register. + + For ESTRx variants, edxIN and eaxIN hold the values of those two + registers. + + In all cases, the bottom 16 bits of the result contain the new + OSZACP %rflags values. For xSTRI variants, bits[31:16] of the + result hold the new %ecx value. For xSTRM variants, the helper + writes the result directly to the guest XMM0. + + Declarable side effects: in all cases, reads guest state at + [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes + guest_XMM0. + + Is expected to be called with opc_and_imm combinations which have + actually been validated, and will assert if otherwise. The front + end should ensure we're only called with verified values. 
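As a concrete illustration of the mask convention produced by zmask_from_V128 above, here is a standalone model, illustrative only, with unsigned char standing in for VEX's UChar:

   /* Standalone model of zmask_from_V128. */
   #include <stdio.h>
   #include <string.h>

   static unsigned zmask16 ( const unsigned char* p )
   {
      unsigned i, res = 0;
      for (i = 0; i < 16; i++)
         res |= (p[i] == 0 ? 1u : 0u) << i;
      return res;
   }

   int main(void)
   {
      unsigned char v[16];
      memset(v, 0, sizeof v);
      memcpy(v, "abc", 3);                  /* bytes 3..15 are zero */
      printf("zmask = %#x\n", zmask16(v));  /* prints zmask = 0xfff8 */
      return 0;
   }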
+*/ +ULong amd64g_dirtyhelper_PCMPxSTRx ( + VexGuestAMD64State* gst, + HWord opc4_and_imm, + HWord gstOffL, HWord gstOffR, + HWord edxIN, HWord eaxIN + ) { - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_3A( argL, argR ); -} + HWord opc4 = (opc4_and_imm >> 8) & 0xFF; + HWord imm8 = opc4_and_imm & 0xFF; + HWord isISTRx = opc4 & 2; + HWord isxSTRM = (opc4 & 1) ^ 1; + vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */ + vassert((imm8 & 1) == 0); /* we support byte-size cases only */ + + // where the args are + V128* argL = (V128*)( ((UChar*)gst) + gstOffL ); + V128* argR = (V128*)( ((UChar*)gst) + gstOffR ); + + /* Create the arg validity masks, either from the vectors + themselves or from the supplied edx/eax values. */ + // FIXME: this is only right for the 8-bit data cases. + // At least that is asserted above. + UInt zmaskL, zmaskR; + if (isISTRx) { + zmaskL = zmask_from_V128(argL); + zmaskR = zmask_from_V128(argR); + } else { + Int tmp; + tmp = edxIN & 0xFFFFFFFF; + if (tmp < -16) tmp = -16; + if (tmp > 16) tmp = 16; + if (tmp < 0) tmp = -tmp; + vassert(tmp >= 0 && tmp <= 16); + zmaskL = (1 << tmp) & 0xFFFF; + tmp = eaxIN & 0xFFFFFFFF; + if (tmp < -16) tmp = -16; + if (tmp > 16) tmp = 16; + if (tmp < 0) tmp = -tmp; + vassert(tmp >= 0 && tmp <= 16); + zmaskR = (1 << tmp) & 0xFFFF; + } -ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) -{ - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_4A( argL, argR ); + // temp spot for the resulting flags and vector. + V128 resV; + UInt resOSZACP; + + // do the meyaath + Bool ok = compute_PCMPxSTRx ( + &resV, &resOSZACP, argL, argR, + zmaskL, zmaskR, imm8, (Bool)isxSTRM + ); + + // front end shouldn't pass us any imm8 variants we can't + // handle. Hence: + vassert(ok); + + // So, finally we need to get the results back to the caller. + // In all cases, the new OSZACP value is the lowest 16 of + // the return value. 
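The EDX/EAX clamping earlier in this function reduces an explicit length to the same zmask form. A standalone model of just that conversion, illustrative only:

   /* Model of the ESTRx length -> zmask conversion. */
   #include <stdio.h>

   static unsigned zmask_from_len ( int len )
   {
      if (len < -16) len = -16;
      if (len >  16) len =  16;
      if (len <   0) len = -len;
      /* bit |len| marks the first invalid byte; 1 << 16 masks to 0,
         meaning all 16 bytes are valid */
      return (1u << len) & 0xFFFF;
   }

   int main(void)
   {
      printf("%#x %#x %#x\n",
             zmask_from_len(3),     /* 0x8: byte 3 is first invalid byte */
             zmask_from_len(-5),    /* 0x20: negative lengths use |len| */
             zmask_from_len(20));   /* 0: clamped to 16, all bytes valid */
      return 0;
   }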
+ if (isxSTRM) { + /* gst->guest_XMM0 = resV; */ // gcc don't like that + gst->guest_XMM0[0] = resV.w32[0]; + gst->guest_XMM0[1] = resV.w32[1]; + gst->guest_XMM0[2] = resV.w32[2]; + gst->guest_XMM0[3] = resV.w32[3]; + return resOSZACP & 0x8D5; + } else { + UInt newECX = resV.w32[0] & 0xFFFF; + return (newECX << 16) | (resOSZACP & 0x8D5); + } } diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 7ffb433a9d..b7e78a80ab 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -14461,6 +14461,61 @@ DisResult disInstr_AMD64_WRK ( goto decode_success; } + /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8 + Extract byte from r32/m8 and insert into xmm1 */ + if ( have66noF2noF3( pfx ) + && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) { + + Int imm8; + IRTemp new8 = newTemp(Ity_I64); + + modrm = insn[3]; + + if ( epartIsReg( modrm ) ) { + imm8 = (Int)(insn[3+1] & 0xF); + assign( new8, binop(Iop_And64, + unop(Iop_32Uto64, + getIReg32(eregOfRexRM(pfx,modrm))), + mkU64(0xFF))); + delta += 3+1+1; + DIP( "pinsrb $%d,%s,%s\n", imm8, + nameIReg32( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 ); + imm8 = (Int)(insn[3+alen] & 0xF); + assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) ))); + delta += 3+alen+1; + DIP( "pinsrb $%d,%s,%s\n", + imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + // Create a V128 value which has the selected byte in the + // specified lane, and zeroes everywhere else. + IRTemp tmp128 = newTemp(Ity_V128); + IRTemp halfshift = newTemp(Ity_I64); + assign(halfshift, binop(Iop_Shl64, + mkexpr(new8), mkU8(8 * (imm8 & 7)))); + vassert(imm8 >= 0 && imm8 <= 15); + if (imm8 < 8) { + assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift))); + } else { + assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0))); + } + + UShort mask = ~(1 << imm8); + + putXMMReg( gregOfRexRM(pfx, modrm), + binop( Iop_OrV128, + mkexpr(tmp128), + binop( Iop_AndV128, + getXMMReg( gregOfRexRM(pfx, modrm) ), + mkV128(mask) ) ) ); + + goto decode_success; + } + /* 66 0F 38 37 = PCMPGTQ 64x2 comparison (signed, presumably; the Intel docs don't say :-) */ @@ -15174,14 +15229,19 @@ DisResult disInstr_AMD64_WRK ( } /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1 + 66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1 + 66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1 + 66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1 (selected special cases that actually occur in glibc, not by any means a complete implementation.) */ if (have66noF2noF3(pfx) && sz == 2 && insn[0] == 0x0F && insn[1] == 0x3A - && insn[2] == 0x63) { + && (insn[2] >= 0x60 && insn[2] <= 0x63)) { + UInt isISTRx = insn[2] & 2; + UInt isxSTRM = (insn[2] & 1) ^ 1; UInt regNoL = 0; UInt regNoR = 0; UChar imm = 0; @@ -15208,35 +15268,41 @@ DisResult disInstr_AMD64_WRK ( } /* Now we know the XMM reg numbers for the operands, and the - immediate byte. Is it one we can actually handle? */ - void* fn = NULL; - HChar* nm = NULL; + immediate byte. Is it one we can actually handle? Throw out + any cases for which the helper function has not been + verified. 
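The byte-insertion trick in the PINSRB case above (shift the new byte into place within one 64-bit half, then mask and OR it into the destination) can be checked against a scalar model. This sketch is illustrative only, with two uint64_t halves standing in for a V128:

   /* Scalar model of the PINSRB lane insert: place byte b in lane
      imm8 (0..15), preserving all other lanes. */
   #include <stdint.h>
   #include <stdio.h>

   static void pinsrb_model ( uint64_t v[2], uint8_t b, int imm8 )
   {
      uint64_t halfshift = (uint64_t)b << (8 * (imm8 & 7));
      uint64_t lanemask  = 0xFFULL    << (8 * (imm8 & 7));
      int hi = (imm8 >> 3) & 1;            /* lanes 8..15 live in v[1] */
      v[hi] = (v[hi] & ~lanemask) | halfshift;
   }

   int main(void)
   {
      uint64_t v[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
      pinsrb_model(v, 0xAB, 9);            /* lane 9 = byte 1 of high half */
      printf("%016llx %016llx\n",
             (unsigned long long)v[1], (unsigned long long)v[0]);
      /* expect: 222222222222ab22 1111111111111111 */
      return 0;
   }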
*/ switch (imm) { - case 0x08: fn = &amd64g_dirtyhelper_ISTRI_08; - nm = "amd64g_dirtyhelper_ISTRI_08"; break; - case 0x0C: fn = &amd64g_dirtyhelper_ISTRI_0C; - nm = "amd64g_dirtyhelper_ISTRI_0C"; break; - case 0x3A: fn = &amd64g_dirtyhelper_ISTRI_3A; - nm = "amd64g_dirtyhelper_ISTRI_3A"; break; - case 0x4A: fn = &amd64g_dirtyhelper_ISTRI_4A; - nm = "amd64g_dirtyhelper_ISTRI_4A"; break; - default: goto decode_failure; - } - vassert(fn); vassert(nm); + case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12: + case 0x1A: case 0x3A: case 0x44: case 0x4A: + break; + default: + goto decode_failure; + } + + /* Who ya gonna call? Presumably not Ghostbusters. */ + void* fn = &amd64g_dirtyhelper_PCMPxSTRx; + HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx"; + /* Round up the arguments. Note that this is a kludge -- the + use of mkU64 rather than mkIRExpr_HWord implies the + assumption that the host's word size is 64-bit. */ UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); UInt gstOffR = xmmGuestRegOffset(regNoR); - IRTemp resT = newTemp(Ity_I64); - IRDirty* d - = unsafeIRDirty_1_N( resT, 0/*regparms*/, - nm, fn, - mkIRExprVec_2( mkIRExpr_HWord(gstOffL), - mkIRExpr_HWord(gstOffR)) ); + IRExpr* opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF)); + IRExpr* gstOffLe = mkU64(gstOffL); + IRExpr* gstOffRe = mkU64(gstOffR); + IRExpr* edxIN = isISTRx ? mkU64(0) : getIRegRDX(8); + IRExpr* eaxIN = isISTRx ? mkU64(0) : getIRegRAX(8); + IRExpr** args + = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN ); + + IRTemp resT = newTemp(Ity_I64); + IRDirty* d = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args ); /* It's not really a dirty call, but we can't use the clean helper mechanism here for the very lame reason that we can't - pass 2 x V128s by value to a helper. Hence this roundabout - scheme. */ + pass 2 x V128s by value to a helper, nor get one back. Hence + this roundabout scheme. */ d->needsBBP = True; d->nFxState = 2; d->fxState[0].fx = Ifx_Read; @@ -15245,33 +15311,164 @@ DisResult disInstr_AMD64_WRK ( d->fxState[1].fx = Ifx_Read; d->fxState[1].offset = gstOffR; d->fxState[1].size = sizeof(U128); + if (isxSTRM) { + /* Declare that the helper writes XMM0. */ + d->nFxState = 3; + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = xmmGuestRegOffset(0); + d->fxState[2].size = sizeof(U128); + } + stmt( IRStmt_Dirty(d) ); - /* Now resT[15:0] holds what the Intel docs call IntRes2, and - resT[31:16] holds the new OSZACP values. We must park the - resultin ECX and update the condition codes. */ - putIReg64(R_RCX, binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))); + /* Now resT[15:0] holds the new OSZACP values, so the condition + codes must be updated. And for a xSTRI case, resT[31:16] + holds the new ECX value, so stash that too. */ + if (!isxSTRM) { + putIReg64(R_RCX, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(resT), mkU8(16)), + mkU64(0xFFFF))); + } stmt( IRStmt_Put( OFFB_CC_DEP1, - binop(Iop_And64, binop(Iop_Shr64, mkexpr(resT), mkU8(16)), - mkU64(0xFFFF)) + binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF)) )); stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); if (regNoL == 16) { - DIP("pcmpistri $%x,%s,%s\n", + DIP("pcmp%cstr%c $%x,%s,%s\n", + isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i', (UInt)imm, dis_buf, nameXMMReg(regNoR)); } else { - DIP("pcmpistri $%x,%s,%s\n", + DIP("pcmp%cstr%c $%x,%s,%s\n", + isISTRx ? 'i' : 'e', isxSTRM ? 
'm' : 'i', (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR)); } goto decode_success; } + + /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128 + Logical compare (set ZF and CF from AND/ANDN of the operands) */ + if (have66noF2noF3( pfx ) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) { + modrm = insn[3]; + IRTemp vecE = newTemp(Ity_V128); + IRTemp vecG = newTemp(Ity_V128); + + if ( epartIsReg(modrm) ) { + assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm))); + delta += 3+1; + DIP( "ptest %s,%s\n", + nameXMMReg( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 ); + assign(vecE, loadLE( Ity_V128, mkexpr(addr) )); + delta += 3+alen; + DIP( "ptest %s,%s\n", + dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm))); + + /* Set Z=1 iff (vecE & vecG) == 0 + Set C=1 iff (vecE & not vecG) == 0 + */ + + /* andV, andnV: vecE & vecG, vecE and not(vecG) */ + IRTemp andV = newTemp(Ity_V128); + IRTemp andnV = newTemp(Ity_V128); + assign(andV, binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG))); + assign(andnV, binop(Iop_AndV128, + mkexpr(vecE), + binop(Iop_XorV128, mkexpr(vecG), + mkV128(0xFFFF)))); + + /* The same, but reduced to 64-bit values, by or-ing the top + and bottom 64-bits together. It relies on this trick: + + InterleaveLO64x2([a,b],[c,d]) == [b,d] hence + + InterleaveLO64x2([a,b],[a,b]) == [b,b] and similarly + InterleaveHI64x2([a,b],[a,b]) == [a,a] + + and so the OR of the above 2 exprs produces + [a OR b, a OR b], from which we simply take the lower half. + */ + IRTemp and64 = newTemp(Ity_I64); + IRTemp andn64 = newTemp(Ity_I64); + + assign( + and64, + unop(Iop_V128to64, + binop(Iop_OrV128, + binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)), + binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV)) + ) + ) + ); + + assign( + andn64, + unop(Iop_V128to64, + binop(Iop_OrV128, + binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)), + binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV)) + ) + ) + ); + + /* Now convert and64, andn64 to all-zeroes or all-1s, so we can + slice out the Z and C bits conveniently. We use the standard + trick all-zeroes -> all-zeroes, anything-else -> all-ones + done by "(x | -x) >>s (word-size - 1)". + */ + IRTemp z64 = newTemp(Ity_I64); + IRTemp c64 = newTemp(Ity_I64); + assign(z64, + unop(Iop_Not64, + binop(Iop_Sar64, + binop(Iop_Or64, + binop(Iop_Sub64, mkU64(0), mkexpr(and64)), + mkexpr(and64) + ), + mkU8(63))) + ); + + assign(c64, + unop(Iop_Not64, + binop(Iop_Sar64, + binop(Iop_Or64, + binop(Iop_Sub64, mkU64(0), mkexpr(andn64)), + mkexpr(andn64) + ), + mkU8(63))) + ); + + /* And finally, slice out the Z and C flags and set the flags + thunk to COPY for them. OSAP are set to zero. 
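The PTEST flag semantics being implemented here, together with the (x | -x) >>s 63 nonzero-to-all-ones trick, can be modelled in scalar code. A sketch, illustrative only, with each 128-bit operand held as two uint64_t halves:

   /* Scalar check of PTEST: Z = ((E & G) == 0), C = ((E & ~G) == 0). */
   #include <stdint.h>
   #include <stdio.h>

   static uint64_t all_ones_if_nonzero ( uint64_t x )
   {
      /* 0 -> 0, anything else -> all ones; relies on an arithmetic
         right shift, as the Iop_Sar64-based IR above does. */
      return (uint64_t)( (int64_t)( x | (0 - x) ) >> 63 );
   }

   int main(void)
   {
      uint64_t e[2] = { 0x00FF, 0 };   /* vecE */
      uint64_t g[2] = { 0xFF00, 0 };   /* vecG */

      uint64_t and64  = (e[0] &  g[0]) | (e[1] &  g[1]);
      uint64_t andn64 = (e[0] & ~g[0]) | (e[1] & ~g[1]);

      int zf = all_ones_if_nonzero(and64)  == 0;   /* E & G  == 0 ? */
      int cf = all_ones_if_nonzero(andn64) == 0;   /* E & ~G == 0 ? */
      printf("Z=%d C=%d\n", zf, cf);   /* disjoint operands: Z=1 C=0 */
      return 0;
   }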
*/ + IRTemp newOSZACP = newTemp(Ity_I64); + assign(newOSZACP, + binop(Iop_Or64, + binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)), + binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP))); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + + goto decode_success; + } + + /* ---------------------------------------------------- */ /* --- end of the SSE4 decoder --- */ /* ---------------------------------------------------- */ @@ -17319,8 +17516,8 @@ DisResult disInstr_AMD64_WRK ( fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16"; fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16; /* This is a Core-2-like machine */ - /* fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16"; */ - /* fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16; */ + //fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16"; + //fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16; /* This is a Core-i5-like machine */ } else { diff --git a/VEX/priv/guest_generic_x87.c b/VEX/priv/guest_generic_x87.c index 0b8294474d..4204893bdd 100644 --- a/VEX/priv/guest_generic_x87.c +++ b/VEX/priv/guest_generic_x87.c @@ -542,9 +542,10 @@ ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp ) /* We need the definitions for OSZACP eflags/rflags offsets. #including guest_{amd64,x86}_defs.h causes chaos, so just copy the - require values directly. They are not going to change in the - future :-) + required values directly. They are not going to change in the + foreseeable future :-) */ + #define SHIFT_O 11 #define SHIFT_S 7 #define SHIFT_Z 6 @@ -591,204 +592,294 @@ static UInt ctz32 ( UInt x ) return 32 - clz32((~x) & (x-1)); } - -/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from - generated code. Pure function, reads *argLU and *argRU, returned - value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. -*/ -UInt compute_ISTRI_08 ( U128* argLU, U128* argRU ) +/* Convert a 4-bit value to a 32-bit value by cloning each bit 8 + times. There's surely a better way to do this, but I don't know + what it is. */ +static UInt bits4_to_bytes4 ( UInt bits4 ) { - /* unsigned bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); - } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; - - // polarity: + - UInt intRes2 = intRes1; - - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); - - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 
0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] - - return (newFlags << 16) | newECX; + UInt r = 0; + r |= (bits4 & 1) ? 0x000000FF : 0; + r |= (bits4 & 2) ? 0x0000FF00 : 0; + r |= (bits4 & 4) ? 0x00FF0000 : 0; + r |= (bits4 & 8) ? 0xFF000000 : 0; + return r; } -UInt compute_ISTRI_0C ( U128* argLU, U128* argRU ) +/* Given partial results from a pcmpXstrX operation (intRes1, + basically), generate an I- or M-format output value, also the new + OSZACP flags. */ +static +void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + UInt intRes1, + UInt zmaskL, UInt zmaskR, + UInt validL, + UInt pol, UInt idx, + Bool isxSTRM ) { - /* unsigned bytes - equal ordered (substring search) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) + vassert((pol >> 2) == 0); + vassert((idx >> 1) == 0); + + UInt intRes2 = 0; + switch (pol) { + case 0: intRes2 = intRes1; break; // pol + + case 1: intRes2 = ~intRes1; break; // pol - + case 2: intRes2 = intRes1; break; // pol m+ + case 3: intRes2 = intRes1 ^ validL; break; // pol m- + } + intRes2 &= 0xFFFF; + + if (isxSTRM) { + + // generate M-format output (a bit or byte mask in XMM0) + if (idx) { + resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF ); + resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF ); + resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF ); + resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF ); + } else { + resV->w32[0] = intRes2 & 0xFFFF; + resV->w32[1] = 0; + resV->w32[2] = 0; + resV->w32[3] = 0; + } - argL: haystack, argR: needle - */ - UInt i, hi, ni; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolRes = 0, zmaskL = 0, zmaskR = 0; - UInt keepSearching = 1; - for (i = 0; i < 16; i++) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0); - zmaskR = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0); - - if (argL[i] == 0) { - // run off the end of the haystack. - keepSearching = 0; - } - - UInt m = 1; - if (keepSearching) { - for (ni = 0; ni < 16; ni++) { - if (argR[ni] == 0) break; - hi = ni + i; - if (hi >= 16) break; - if (argL[hi] != argR[ni]) { m = 0; break; } - } + } else { + + // generate I-format output (an index in ECX) + // generate ecx value + UInt newECX = 0; + if (idx) { + // index of ms-1-bit + newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); } else { - m = 0; + // index of ls-1-bit + newECX = intRes2 == 0 ? 16 : ctz32(intRes2); } - boolRes = (boolRes >> 1) | (m << 15); + + resV->w32[0] = newECX; + resV->w32[1] = 0; + resV->w32[2] = 0; + resV->w32[3] = 0; } - // boolRes is "pre-invalidated" - UInt intRes1 = boolRes & 0xFFFF; + // generate new flags, common to all ISTRI and ISTRM cases + *resOSZACP // A, P are zero + = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 + | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 + | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 + | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] +} + - // polarity: + - UInt intRes2 = intRes1; +/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} + variants. - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); + For xSTRI variants, the new ECX value is placed in the 32 bits + pointed to by *resV, and the top 96 bits are zeroed. For xSTRM + variants, the result is a 128 bit value and is placed at *resV in + the obvious way. 
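The polarity and index handling in compute_PCMPxSTRx_gen_output above can be exercised in isolation. The following sketch mirrors the pol/idx logic of that function; illustrative only, and __builtin_ctz is a gcc/clang builtin standing in for VEX's ctz32:

   /* Worked model of the polarity/index step. */
   #include <stdio.h>

   static unsigned apply_polarity ( unsigned intRes1, unsigned validL,
                                    unsigned pol )
   {
      unsigned r = 0;
      switch (pol) {
         case 0: r = intRes1;          break;   /* pol +  */
         case 1: r = ~intRes1;         break;   /* pol -  */
         case 2: r = intRes1;          break;   /* pol m+ */
         case 3: r = intRes1 ^ validL; break;   /* pol m- */
      }
      return r & 0xFFFF;
   }

   int main(void)
   {
      unsigned intRes1 = 0x0050;   /* matches at positions 4 and 6 */
      unsigned validL  = 0x00FF;   /* first 8 bytes of argL valid */
      unsigned intRes2 = apply_polarity(intRes1, validL, 3 /* m- */);
      /* pol m- flips only the valid lanes: 0x0050 ^ 0x00FF = 0x00AF */
      printf("intRes2 = %#x, lsb index = %d\n", intRes2,
             __builtin_ctz(intRes2));  /* idx==0 case; intRes2 nonzero */
      return 0;
   }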
- // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + For all variants, the new OSZACP value is placed at *resOSZACP. - return (newFlags << 16) | newECX; -} + argLV and argRV are the vector args. The caller must prepare a + 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this + must be 1 for each zero byte of of the respective arg. For ESTRx + variants this is derived from the explicit length indication, and + must be 0 in all places except at the bit index corresponding to + the valid length (0 .. 16). If the valid length is 16 then the + mask must be all zeroes. In all cases, bits 31:16 must be zero. + imm8 is the original immediate from the instruction. isSTRM + indicates whether this is a xSTRM or xSTRI variant, which controls + how much of *res is written. -UInt compute_ISTRI_3A ( U128* argLU, U128* argRU ) + If the given imm8 case can be handled, the return value is True. + If not, False is returned, and neither *res not *resOSZACP are + altered. +*/ + +Bool compute_PCMPxSTRx ( /*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + V128* argLV, V128* argRV, + UInt zmaskL, UInt zmaskR, + UInt imm8, Bool isxSTRM ) { - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity Masked- (IntRes2 = IntRes1 ^ validL) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); + vassert(imm8 < 0x80); + vassert((zmaskL >> 16) == 0); + vassert((zmaskR >> 16) == 0); + + /* Explicitly reject any imm8 values that haven't been validated, + even if they would probably work. Life is too short to have + unvalidated cases in the code base. */ + switch (imm8) { + case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12: + case 0x1A: case 0x3A: case 0x44: case 0x4A: + break; + default: + return False; } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; + UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format + UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn + UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity + UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask + + /*----------------------------------------*/ + /*-- strcmp on byte data --*/ + /*----------------------------------------*/ + + if (agg == 2/*equal each, aka strcmp*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + Int i; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolResII = 0; + for (i = 15; i >= 0; i--) { + UChar cL = argL[i]; + UChar cR = argR[i]; + boolResII = (boolResII << 1) | (cL == cR ? 
1 : 0); + } + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + // do invalidation, common to all equal-each cases + UInt intRes1 + = (boolResII & validL & validR) // if both valid, use cmpres + | (~ (validL | validR)); // if both invalid, force 1 + // else force 0 + intRes1 &= 0xFFFF; + + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); + + return True; + } - // polarity: Masked- - UInt intRes2 = (intRes1 ^ validL) & 0xFFFF; + /*----------------------------------------*/ + /*-- set membership on byte data --*/ + /*----------------------------------------*/ + + if (agg == 0/*equal any, aka find chars in a set*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + /* argL: the string, argR: charset */ + UInt si, ci; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string. + break; + UInt m = 0; + for (ci = 0; ci < 16; ci++) { + if ((validR & (1 << ci)) == 0) break; + if (argR[ci] == argL[si]) { m = 1; break; } + } + boolRes |= (m << si); + } - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; + + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + return True; + } - return (newFlags << 16) | newECX; -} + /*----------------------------------------*/ + /*-- substring search on byte data --*/ + /*----------------------------------------*/ + + if (agg == 3/*equal ordered, aka substring search*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + + /* argL: haystack, argR: needle */ + UInt ni, hi; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (hi = 0; hi < 16; hi++) { + if ((validL & (1 << hi)) == 0) + // run off the end of the haystack + break; + UInt m = 1; + for (ni = 0; ni < 16; ni++) { + if ((validR & (1 << ni)) == 0) break; + UInt i = ni + hi; + if (i >= 16) break; + if (argL[i] != argR[ni]) { m = 0; break; } + } + boolRes |= (m << hi); + } + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; -UInt compute_ISTRI_4A ( U128* argLU, U128* argRU ) -{ - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 1 (want index of ms 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 
1 : 0); + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); + + return True; } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; + /*----------------------------------------*/ + /*-- ranges, unsigned byte data --*/ + /*----------------------------------------*/ + + if (agg == 1/*ranges*/ + && fmt == 0/*ub*/) { + + /* argL: string, argR: range-pairs */ + UInt ri, si; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string + break; + UInt m = 0; + for (ri = 0; ri < 16; ri += 2) { + if ((validR & (3 << ri)) != (3 << ri)) break; + if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { + m = 1; break; + } + } + boolRes |= (m << si); + } - // polarity - UInt intRes2 = intRes1; + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; - // generate ecx value, common to all index-of-ms-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + return True; + } - return (newFlags << 16) | newECX; + return False; } diff --git a/VEX/priv/guest_generic_x87.h b/VEX/priv/guest_generic_x87.h index f07b0ea080..9cbe23b38d 100644 --- a/VEX/priv/guest_generic_x87.h +++ b/VEX/priv/guest_generic_x87.h @@ -98,15 +98,14 @@ typedef generated code. CLEAN HELPER. */ extern ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp ); -/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from - generated code. Pure function, reads *argLU and *argRU, returned - value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. */ -extern UInt compute_ISTRI_08 ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_0C ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_3A ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_4A ( U128* argLU, U128* argRU ); - +/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} + variants. See bigger comment on implementation of this function + for details on call/return conventions. */ +extern Bool compute_PCMPxSTRx ( /*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + V128* argLV, V128* argRV, + UInt zmaskL, UInt zmaskR, + UInt imm8, Bool isxSTRM ); #endif /* ndef __VEX_GUEST_GENERIC_X87_H */ diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index cf89d535c0..1e719e7151 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -788,6 +788,21 @@ static HReg do_sse_NotV128 ( ISelEnv* env, HReg src ) } +/* Expand the given byte into a 64-bit word, by cloning each bit + 8 times. 
*/ +static ULong bitmask8_to_bytemask64 ( UShort w8 ) +{ + vassert(w8 == (w8 & 0xFF)); + ULong w64 = 0; + Int i; + for (i = 0; i < 8; i++) { + if (w8 & (1<<i)) + w64 |= (0xFFULL << (8 * i)); + } + return w64; +} [...] switch (e->Iex.Const.con->Ico.V128) { case 0x0000: dst = generate_zeroes_V128(env); - return dst; + break; case 0xFFFF: dst = generate_ones_V128(env); - return dst; - break; - default: - break; - } - AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); - const ULong const_z64 = 0x0000000000000000ULL; - const ULong const_o64 = 0xFFFFFFFFFFFFFFFFULL; - const ULong const_z32o32 = 0x00000000FFFFFFFFULL; - const ULong const_o32z32 = 0xFFFFFFFF00000000ULL; - switch (e->Iex.Const.con->Ico.V128) { - case 0x0000: case 0xFFFF: - vassert(0); /* handled just above */ - /* do push_uimm64 twice, first time for the high-order half. */ - case 0x00F0: - push_uimm64(env, const_z64); - push_uimm64(env, const_o32z32); - break; - case 0x00FF: - push_uimm64(env, const_z64); - push_uimm64(env, const_o64); - break; - case 0x000F: - push_uimm64(env, const_z64); - push_uimm64(env, const_z32o32); break; - case 0x0F00: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_z64); - break; - case 0x0F0F: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_z32o32); - break; - case 0x0FF0: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_o32z32); - break; - case 0x0FFF: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_o64); - break; - case 0xF000: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_z64); - break; - case 0xF00F: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_z32o32); - break; - case 0xF0F0: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_o32z32); - break; - case 0xF0FF: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_o64); - break; - case 0xFF00: - push_uimm64(env, const_o64); - push_uimm64(env, const_z64); - break; - case 0xFF0F: - push_uimm64(env, const_o64); - push_uimm64(env, const_z32o32); - break; - case 0xFFF0: - push_uimm64(env, const_o64); - push_uimm64(env, const_o32z32); + default: { + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + /* do push_uimm64 twice, first time for the high-order half. */ + push_uimm64(env, bitmask8_to_bytemask64( + (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF + )); + push_uimm64(env, bitmask8_to_bytemask64( + (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF + )); + addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 )); + add_to_rsp(env, 16); break; - default: - goto vec_fail; + } } - addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 )); - add_to_rsp(env, 16); return dst; } @@ -3723,7 +3679,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } - vec_fail: + //vec_fail: vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); ppIRExpr(e); diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h index 125514a737..53850cbdd2 100644 --- a/VEX/priv/host_generic_simd128.h +++ b/VEX/priv/host_generic_simd128.h @@ -43,18 +43,6 @@ #ifndef __VEX_HOST_GENERIC_SIMD128_H #define __VEX_HOST_GENERIC_SIMD128_H -/* A union for doing 128-bit primitives conveniently. It is not - public and so not placed in pub/. */ -typedef - union { - UChar w8[16]; - UShort w16[8]; - UInt w32[4]; - ULong w64[2]; - } - V128; - - #include "libvex_basictypes.h" /* DO NOT MAKE THESE INTO REGPARM FNS!
THIS WILL BREAK CALLING SEQUENCES GENERATED BY host-x86/isel.c. */ diff --git a/VEX/pub/libvex_basictypes.h b/VEX/pub/libvex_basictypes.h index a996f2e563..a945913547 100644 --- a/VEX/pub/libvex_basictypes.h +++ b/VEX/pub/libvex_basictypes.h @@ -62,7 +62,17 @@ typedef signed long long int Long; /* Always 128 bits. */ typedef UInt U128[4]; +/* A union for doing 128-bit vector primitives conveniently. */ +typedef + union { + UChar w8[16]; + UShort w16[8]; + UInt w32[4]; + ULong w64[2]; + } + V128; +/* Floating point. */ typedef float Float; /* IEEE754 single-precision (32-bit) value */ typedef double Double; /* IEEE754 double-precision (64-bit) value */ -- 2.47.2
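To see the V128 union added above and the isel's bit-to-byte mask expansion working together, a standalone usage sketch; stdint types stand in for VEX's UChar/UInt/ULong, illustrative only:

   /* Usage sketch: the V128 union plus the bit -> byte-mask
      expansion used by bitmask8_to_bytemask64. */
   #include <stdint.h>
   #include <stdio.h>

   typedef union {
      uint8_t  w8[16];
      uint16_t w16[8];
      uint32_t w32[4];
      uint64_t w64[2];
   } V128;

   static uint64_t bitmask8_to_bytemask64 ( uint16_t w8 )
   {
      uint64_t w64 = 0;
      int i;
      for (i = 0; i < 8; i++) {
         if (w8 & (1 << i))
            w64 |= (0xFFULL << (8 * i));
      }
      return w64;
   }

   int main(void)
   {
      V128 v;
      /* A V128 constant like mkV128(0x000F) becomes bytes 0..3 all ones. */
      v.w64[0] = bitmask8_to_bytemask64(0x0F);   /* low 8 lanes  */
      v.w64[1] = bitmask8_to_bytemask64(0x00);   /* high 8 lanes */
      printf("w32[0]=%#x w32[1]=%#x\n",
             (unsigned)v.w32[0], (unsigned)v.w32[1]);
      /* prints w32[0]=0xffffffff w32[1]=0 */
      return 0;
   }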